Skip to content

Commit 540b24b

Browse files
jan-wassenbergcopybara-github
authored andcommitted
Fix bench_matmul perf regression: A input should be padded
PiperOrigin-RevId: 781940882
1 parent 4bc44d5 commit 540b24b

4 files changed

Lines changed: 27 additions & 17 deletions

File tree

compression/test_util-inl.h

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,45 +69,51 @@ void ForeachPackedAndRawType() {
6969

7070
// Generates inputs: deterministic, within max SfpStream range.
7171
template <typename MatT>
72-
MatStorageT<MatT> GenerateMat(const Extents2D& extents, hwy::ThreadPool& pool) {
72+
MatStorageT<MatT> GenerateMat(const Extents2D& extents, MatPadding padding,
73+
hwy::ThreadPool& pool) {
7374
gcpp::CompressWorkingSet ws;
75+
ws.tls.resize(pool.NumWorkers());
7476
MatStorageT<float> raw("raw", extents, MatPadding::kPacked);
75-
MatStorageT<MatT> compressed("mat", extents, MatPadding::kPacked);
77+
MatStorageT<MatT> compressed("mat", extents, padding);
7678
const float scale = SfpStream::kMax / extents.Area();
77-
pool.Run(0, extents.rows, [&](const size_t r, size_t /*thread*/) {
79+
pool.Run(0, extents.rows, [&](const size_t r, size_t thread) {
7880
float* HWY_RESTRICT row = raw.Row(r);
7981
for (size_t c = 0; c < extents.cols; c++) {
8082
float f = static_cast<float>(r * extents.cols + c) * scale;
8183
if ((r + c) & 1) f = -f; // Also generate some negative values.
8284
row[c] = f;
8385
}
86+
Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
87+
MakeSpan(compressed.Row(r), compressed.Cols()),
88+
/*packed_ofs=*/0);
8489
});
8590

86-
Compress(raw.PackedScale1(), raw.Extents().Area(), ws, compressed.Span(),
87-
/*packed_ofs=*/0, pool);
8891
compressed.SetScale(0.6f); // Arbitrary value, different from 1.
8992
return compressed;
9093
}
9194

92-
// `extents` describes the transposed matrix.
95+
// Same, but `extents` describes the transposed matrix.
9396
template <typename MatT>
9497
MatStorageT<MatT> GenerateTransposedMat(const Extents2D extents,
98+
MatPadding padding,
9599
hwy::ThreadPool& pool) {
96100
gcpp::CompressWorkingSet ws;
101+
ws.tls.resize(pool.NumWorkers());
97102
MatStorageT<float> raw("raw", extents, MatPadding::kPacked);
98-
MatStorageT<MatT> compressed("trans", extents, MatPadding::kPacked);
103+
MatStorageT<MatT> compressed("trans", extents, padding);
99104
const float scale = SfpStream::kMax / extents.Area();
100-
pool.Run(0, extents.rows, [&](const size_t r, size_t /*thread*/) {
105+
pool.Run(0, extents.rows, [&](const size_t r, size_t thread) {
101106
float* HWY_RESTRICT row = raw.Row(r);
102107
for (size_t c = 0; c < extents.cols; c++) {
103108
float f = static_cast<float>(c * extents.rows + r) * scale;
104109
if ((r + c) & 1) f = -f; // Also generate some negative values.
105110
row[c] = f;
106111
}
112+
Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
113+
MakeSpan(compressed.Row(r), compressed.Cols()),
114+
/*packed_ofs=*/0);
107115
});
108116

109-
Compress(raw.PackedScale1(), raw.Extents().Area(), ws, compressed.Span(),
110-
/*packed_ofs=*/0, pool);
111117
// Arbitrary value, different from 1, must match `GenerateMat`.
112118
compressed.SetScale(0.6f);
113119
return compressed;

ops/bench_matmul.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,14 @@ void BenchMatMul(size_t M, size_t K, size_t N, bool add, MatMulEnv& env) {
8989

9090
MatStorageT<float> add_storage("add", Extents2D(), MatPadding::kPacked);
9191
if (add) {
92-
add_storage = GenerateMat<float>(Extents2D(1, N), pool);
92+
add_storage =
93+
GenerateMat<float>(Extents2D(1, N), MatPadding::kPacked, pool);
9394
add_storage.SetScale(1.0f);
9495
}
9596

96-
MatStorageT<TA> a = GenerateMat<TA>(A_extents, pool);
97-
MatStorageT<TB> b_trans = GenerateTransposedMat<TB>(B_extents, pool);
97+
MatStorageT<TA> a = GenerateMat<TA>(A_extents, MatPadding::kOdd, pool);
98+
MatStorageT<TB> b_trans =
99+
GenerateTransposedMat<TB>(B_extents, MatPadding::kOdd, pool);
98100

99101
const float* add_row = add ? add_storage.PackedScale1() : nullptr;
100102

ops/matmul-inl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1298,7 +1298,7 @@ struct MMImpl {
12981298
// `K = B.Cols()`, which must match `A.Cols()`, is the number
12991299
// of rows in the original B. `N = C.Cols()` must be a multiple of 4. There
13001300
// are no other restrictions on shape, though performance is better when `M % 4
1301-
// == 0` or `M <= 4`.
1301+
// == 0` or `M <= 4`, and when A is padded (`!A.IsPacked()`).
13021302
//
13031303
// NOTE: if A and/or B are BF16 and padded, the interval `[Cols(),
13041304
// hwy::RoundUpTo(Cols(), hn::Lanes(dbf))` must be zero-initialized to match

ops/matmul_test.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,14 +219,16 @@ void TestMatMul(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, bool add,
219219
const Extents2D B_extents(cols_bc, cols_a_rows_b); // already transposed
220220
const Extents2D C_extents(rows_ac, cols_bc);
221221

222-
MatStorageT<TA> A(GenerateMat<TA>(A_extents, pool));
223-
MatStorageT<TB> BT(GenerateTransposedMat<TB>(B_extents, pool));
222+
MatStorageT<TA> A(GenerateMat<TA>(A_extents, MatPadding::kOdd, pool));
223+
// Must be packed because we call Span() on it.
224+
MatStorageT<TB> BT(
225+
GenerateTransposedMat<TB>(B_extents, MatPadding::kPacked, pool));
224226
MatStorageT<TC> C_slow("C_slow", C_extents, MatPadding::kOdd);
225227
MatStorageT<TC> C("C", C_extents, MatPadding::kOdd);
226228
C.AllocateAndAttachRowPtrs(env.row_ptrs);
227229

228230
MatStorageT<float> add_storage =
229-
add ? GenerateMat<float>(Extents2D(1, cols_bc), pool)
231+
add ? GenerateMat<float>(Extents2D(1, cols_bc), MatPadding::kPacked, pool)
230232
: MatStorageT<float>("add", Extents2D(), MatPadding::kPacked);
231233
add_storage.SetScale(1.0f);
232234
const float* add_row = add ? add_storage.PackedScale1() : nullptr;

0 commit comments

Comments
 (0)