Skip to content

Commit 3ed403e

Browse files
jan-wassenbergcopybara-github
authored andcommitted
Major cleanup of profiler zones, add Caller annotation for all pool.Run
Pass ThreadingContext instead of Pools/Profiler individually, for access to Zones Add GCPP_ZONE helper Add Caller argument to pool.Run to enable new stats Remove most direct dependencies on ThreadPool, prefer ParallelFor PiperOrigin-RevId: 822934530
1 parent 9e8ac7e commit 3ed403e

43 files changed

Lines changed: 810 additions & 591 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

BUILD.bazel

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ cc_library(
9595
":topology",
9696
# Placeholder for container detection, do not remove
9797
"@highway//:hwy",
98-
"@highway//:profiler",
9998
"@highway//:thread_pool",
10099
"@highway//:topology",
101100
],
@@ -124,7 +123,9 @@ cc_library(
124123
srcs = ["util/zones.cc"],
125124
hdrs = ["util/zones.h"],
126125
deps = [
126+
"@highway//:hwy",
127127
"@highway//:profiler",
128+
"@highway//:thread_pool",
128129
],
129130
)
130131

@@ -258,7 +259,6 @@ cc_library(
258259
"//io:fields",
259260
"@highway//:hwy",
260261
"@highway//:profiler",
261-
"@highway//:thread_pool",
262262
],
263263
)
264264

@@ -278,7 +278,6 @@ cc_library(
278278
"//io:blob_store",
279279
"@highway//:hwy",
280280
"@highway//:profiler",
281-
"@highway//:thread_pool",
282281
],
283282
)
284283

@@ -309,7 +308,6 @@ cc_library(
309308
deps = [
310309
":allocator",
311310
":basics",
312-
":configs",
313311
":mat",
314312
":threading",
315313
":threading_context",
@@ -397,7 +395,6 @@ cc_library(
397395
"@highway//:hwy",
398396
"@highway//:math",
399397
"@highway//:profiler",
400-
"@highway//:thread_pool",
401398
"@highway//hwy/contrib/sort:vqsort",
402399
],
403400
)
@@ -424,7 +421,6 @@ cc_test(
424421
"@highway//:nanobenchmark", #buildcleaner: keep
425422
"@highway//:profiler",
426423
"@highway//:stats",
427-
"@highway//:thread_pool",
428424
],
429425
)
430426

@@ -507,7 +503,6 @@ cc_test(
507503
"@highway//:hwy_test_util",
508504
"@highway//:nanobenchmark",
509505
"@highway//:profiler",
510-
"@highway//:thread_pool",
511506
],
512507
)
513508

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ set(CMAKE_CXX_STANDARD 17)
2222
set(CMAKE_CXX_STANDARD_REQUIRED ON)
2323
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
2424

25-
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9781a1698ee0756ef1eaaf96930113ed7cb6d3ee EXCLUDE_FROM_ALL)
25+
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 2a16a50ff61071bb25ddef0ce35d92b0e2b9c579 EXCLUDE_FROM_ALL)
2626
FetchContent_MakeAvailable(highway)
2727

2828
## Note: absl needs to be installed by sentencepiece. This will only happen if

MODULE.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ bazel_dep(name = "google_benchmark", version = "1.8.5")
1818
# Require a more recent version.
1919
git_override(
2020
module_name = "highway",
21-
commit = "9781a1698ee0756ef1eaaf96930113ed7cb6d3ee",
21+
commit = "2a16a50ff61071bb25ddef0ce35d92b0e2b9c579",
2222
remote = "https://github.com/google/highway",
2323
)
2424

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ FetchContent_MakeAvailable(sentencepiece)
452452
FetchContent_Declare(gemma GIT_REPOSITORY https://github.com/google/gemma.cpp GIT_TAG origin/main)
453453
FetchContent_MakeAvailable(gemma)
454454
455-
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9781a1698ee0756ef1eaaf96930113ed7cb6d3ee)
455+
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 2a16a50ff61071bb25ddef0ce35d92b0e2b9c579)
456456
FetchContent_MakeAvailable(highway)
457457
```
458458

compression/BUILD.bazel

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ cc_library(
120120
":compress",
121121
":distortion",
122122
"//:mat",
123+
"//:threading_context",
123124
"@highway//:hwy",
124125
"@highway//:hwy_test_util",
125-
"@highway//:thread_pool",
126126
],
127127
)
128128

@@ -180,6 +180,7 @@ cc_library(
180180
":sfp",
181181
"//:basics",
182182
"//:mat",
183+
"//:threading_context",
183184
"@highway//:hwy",
184185
"@highway//:nanobenchmark",
185186
"@highway//:profiler",
@@ -203,9 +204,9 @@ cc_test(
203204
":test_util",
204205
"@googletest//:gtest_main", # buildcleaner: keep
205206
"//:test_util",
207+
"//:threading_context",
206208
"@highway//:hwy",
207209
"@highway//:hwy_test_util",
208-
"@highway//:thread_pool",
209210
],
210211
)
211212

compression/compress-inl.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626

2727
#include "compression/compress.h" // IWYU pragma: export
2828
#include "compression/distortion.h"
29+
#include "util/threading_context.h"
2930
#include "hwy/aligned_allocator.h"
3031
#include "hwy/base.h"
3132
#include "hwy/contrib/thread_pool/thread_pool.h"
32-
#include "hwy/timer.h"
3333

3434
#if COMPRESS_STATS
3535
#include <cmath> // lroundf
@@ -493,13 +493,13 @@ struct CompressTraits<NuqStream> {
493493
}
494494
};
495495

496-
// Compresses `num` elements of `raw` to `packed` starting at `packed_ofs`,
497-
// which is useful for compressing sub-regions of an array.
496+
// DEPRECATED: Use the overload with ThreadingContext instead.
498497
template <typename Packed>
499498
HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num,
500499
CompressWorkingSet& work,
501500
const PackedSpan<Packed>& packed,
502-
const size_t packed_ofs, hwy::ThreadPool& pool) {
501+
const size_t packed_ofs, hwy::ThreadPool& pool,
502+
hwy::pool::Caller caller = hwy::pool::Caller()) {
503503
packed.BoundsCheck(packed_ofs, num);
504504
work.tls.resize(pool.NumWorkers());
505505
if constexpr (COMPRESS_STATS) {
@@ -511,7 +511,7 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num,
511511
using Traits = CompressTraits<hwy::RemoveConst<Packed>>;
512512
constexpr size_t kBatch = 8192;
513513
const size_t num_batches = hwy::DivCeil(num, kBatch);
514-
pool.Run(0, num_batches,
514+
pool.Run(0, num_batches, caller,
515515
[&](const uint32_t idx_batch, size_t thread) HWY_ATTR {
516516
const hn::ScalableTag<float> df;
517517

@@ -530,6 +530,17 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num,
530530
}
531531
}
532532

533+
// Compresses `num` elements of `raw` to `packed` starting at `packed_ofs`,
534+
// which is useful for compressing sub-regions of an array.
535+
template <typename Packed>
536+
HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num,
537+
CompressWorkingSet& work,
538+
const PackedSpan<Packed>& packed,
539+
const size_t packed_ofs, ThreadingContext& ctx) {
540+
Compress(raw, num, work, packed, packed_ofs, ctx.pools.Pool(),
541+
ctx.pool_callers.Get(Callers::kCompress));
542+
}
543+
533544
// Same as above, but without parallelization nor benchmarking.
534545
template <typename Packed>
535546
HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num,

compression/compress_test.cc

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
#include "compression/compress.h"
2525
#include "compression/distortion.h"
2626
#include "util/test_util.h"
27+
#include "util/threading_context.h"
2728
#include "hwy/aligned_allocator.h"
2829
#include "hwy/base.h"
29-
#include "hwy/contrib/thread_pool/thread_pool.h"
3030
#include "hwy/tests/hwy_gtest.h"
3131
// clang-format off
3232
#undef HWY_TARGET_INCLUDE
@@ -42,14 +42,27 @@ namespace gcpp {
4242
namespace HWY_NAMESPACE {
4343
namespace hn = hwy::HWY_NAMESPACE;
4444

45+
static ThreadingArgs SingleThreadArgs() {
46+
ThreadingArgs args;
47+
args.max_lps = 1;
48+
return args;
49+
}
50+
51+
static ThreadingContext& Ctx() {
52+
static ThreadingContext* ctx = new ThreadingContext(SingleThreadArgs());
53+
return *ctx;
54+
}
55+
4556
// Calls Compress and Decompress2 and verifies the distortion/error.
4657
template <typename Packed>
4758
struct TestDecompress2 {
4859
template <typename T, class D>
4960
HWY_INLINE void operator()(T /*unused*/, D d) {
5061
const size_t N = hn::Lanes(d);
5162
CompressWorkingSet work;
52-
hwy::ThreadPool pool(0);
63+
ThreadingArgs args;
64+
args.max_lps = 1;
65+
ThreadingContext ctx(args);
5366
hwy::RandomState rng;
5467

5568
const size_t num = 2 * N;
@@ -68,7 +81,7 @@ struct TestDecompress2 {
6881
// Short inputs fail VerifyGaussian.
6982

7083
const size_t packed_ofs = 0;
71-
Compress(raw.get(), num, work, packed_span, packed_ofs, pool);
84+
Compress(raw.get(), num, work, packed_span, packed_ofs, ctx);
7285
hn::Vec<D> raw0, raw1;
7386
Decompress2(d, MakeConst(packed_span), packed_ofs, raw0, raw1);
7487
hn::Store(raw0, d, dec.get());
@@ -129,7 +142,6 @@ struct TestShortLengths {
129142
HWY_INLINE void operator()(T /*unused*/, D d) {
130143
const size_t N = hn::Lanes(d);
131144
CompressWorkingSet work;
132-
hwy::ThreadPool pool(0);
133145
hwy::RandomState rng;
134146

135147
for (size_t num = 1; num < 5 * hn::Lanes(d); ++num) {
@@ -149,7 +161,7 @@ struct TestShortLengths {
149161
// Short inputs fail VerifyGaussian.
150162

151163
const size_t packed_ofs = 0;
152-
Compress(raw.get(), num, work, packed_span, packed_ofs, pool);
164+
Compress(raw.get(), num, work, packed_span, packed_ofs, Ctx());
153165
DecompressAndZeroPad(d, MakeConst(packed_span), packed_ofs, dec.get(),
154166
num);
155167

compression/python/compression_clif_aux.cc

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
#include "util/basics.h"
3838
#include "util/mat.h"
3939
#include "util/threading_context.h"
40-
#include "hwy/contrib/thread_pool/thread_pool.h"
4140

4241
#undef HWY_TARGET_INCLUDE
4342
#define HWY_TARGET_INCLUDE \
@@ -57,9 +56,6 @@ class SbsWriterImpl : public ISbsWriter {
5756
template <typename Packed>
5857
void InsertT(const char* name, F32Span weights,
5958
const TensorInfo& tensor_info) {
60-
// TODO(janwas): 1D parallel-for.
61-
hwy::ThreadPool& pool = ctx_.pools.Pool();
62-
6359
MatPtrT<Packed> mat(name, ExtentsFromInfo(&tensor_info));
6460
// SFP and NUQ (which uses SFP for cluster centers) have a limited range
6561
// and depending on the input values may require rescaling. Scaling is
@@ -82,21 +78,20 @@ class SbsWriterImpl : public ISbsWriter {
8278
// succeeds, but we only have 10 floats, not the full tensor.
8379
if (weights.size() == 10 && mat.Extents().Area() != 10) {
8480
Compress(weights.data(), weights.size(), working_set_, mat.Span(),
85-
/*packed_ofs=*/0, pool);
81+
/*packed_ofs=*/0, ctx_);
8682
writer_.Add(name, mat.Packed(), mat.ElementBytes() * 10);
8783
return;
8884
}
8985

9086
HWY_ASSERT(weights.size() == mat.Extents().Area());
9187
Compress(weights.data(), weights.size(), working_set_, mat.Span(),
92-
/*packed_ofs=*/0, pool);
88+
/*packed_ofs=*/0, ctx_);
9389
writer_.Add(name, mat.Packed(), mat.PackedBytes());
9490
}
9591

9692
public:
9793
SbsWriterImpl(const std::string& sbs_path)
98-
: ctx_(ThreadingArgs()),
99-
writer_(gcpp::Path(sbs_path), ctx_.pools.Pool()) {}
94+
: ctx_(ThreadingArgs()), writer_(gcpp::Path(sbs_path), ctx_) {}
10095

10196
void Insert(const char* name, F32Span weights, Type type,
10297
const TensorInfo& tensor_info) override {

compression/test_util-inl.h

Lines changed: 36 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
// IWYU pragma: end_exports
2424

2525
#include "compression/compress.h"
26-
#include "hwy/contrib/thread_pool/thread_pool.h"
26+
#include "util/threading_context.h"
2727

2828
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_
2929

@@ -98,25 +98,26 @@ void ForeachActivationType3(D d) {
9898

9999
// Generates inputs: deterministic, within max SfpStream range.
100100
template <typename MatT>
101-
MatStorageT<MatT> GenerateMat(const Extents2D& extents,
102-
const Allocator& allocator, MatPadding padding,
103-
hwy::ThreadPool& pool) {
101+
MatStorageT<MatT> GenerateMat(const Extents2D& extents, MatPadding padding,
102+
ThreadingContext& ctx) {
104103
gcpp::CompressWorkingSet ws;
105-
ws.tls.resize(pool.NumWorkers());
106-
MatStorageT<float> raw("raw", extents, allocator, MatPadding::kPacked);
107-
MatStorageT<MatT> compressed("mat", extents, allocator, padding);
104+
ws.tls.resize(ctx.pools.MaxWorkers());
105+
MatStorageT<float> raw("raw", extents, ctx.allocator, MatPadding::kPacked);
106+
MatStorageT<MatT> compressed("mat", extents, ctx.allocator, padding);
108107
const float scale = SfpStream::kMax / extents.Area();
109-
pool.Run(0, extents.rows, [&](const size_t r, size_t thread) {
110-
float* HWY_RESTRICT row = raw.Row(r);
111-
for (size_t c = 0; c < extents.cols; c++) {
112-
float f = static_cast<float>(r * extents.cols + c) * scale;
113-
if ((r + c) & 1) f = -f; // Also generate some negative values.
114-
row[c] = f;
115-
}
116-
Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
117-
MakeSpan(compressed.Row(r), extents.cols),
118-
/*packed_ofs=*/0);
119-
});
108+
ParallelFor(ParallelismStrategy::kFlat, extents.rows, ctx, /*cluster_idx=*/0,
109+
Callers::kTest, [&](size_t r, size_t thread) {
110+
float* HWY_RESTRICT row = raw.Row(r);
111+
for (size_t c = 0; c < extents.cols; c++) {
112+
float f = static_cast<float>(r * extents.cols + c) * scale;
113+
if ((r + c) & 1)
114+
f = -f; // Also generate some negative values.
115+
row[c] = f;
116+
}
117+
Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
118+
MakeSpan(compressed.Row(r), extents.cols),
119+
/*packed_ofs=*/0);
120+
});
120121

121122
compressed.SetScale(0.6f); // Arbitrary value, different from 1.
122123
return compressed;
@@ -126,25 +127,26 @@ MatStorageT<MatT> GenerateMat(const Extents2D& extents,
126127
// `f` swaps `r` and `c`.
127128
template <typename MatT>
128129
MatStorageT<MatT> GenerateTransposedMat(const Extents2D extents,
129-
const Allocator& allocator,
130130
MatPadding padding,
131-
hwy::ThreadPool& pool) {
131+
ThreadingContext& ctx) {
132132
gcpp::CompressWorkingSet ws;
133-
ws.tls.resize(pool.NumWorkers());
134-
MatStorageT<float> raw("raw", extents, allocator, MatPadding::kPacked);
135-
MatStorageT<MatT> compressed("trans", extents, allocator, padding);
133+
ws.tls.resize(ctx.pools.MaxWorkers());
134+
MatStorageT<float> raw("raw", extents, ctx.allocator, MatPadding::kPacked);
135+
MatStorageT<MatT> compressed("trans", extents, ctx.allocator, padding);
136136
const float scale = SfpStream::kMax / extents.Area();
137-
pool.Run(0, extents.rows, [&](const size_t r, size_t thread) {
138-
float* HWY_RESTRICT row = raw.Row(r);
139-
for (size_t c = 0; c < extents.cols; c++) {
140-
float f = static_cast<float>(c * extents.rows + r) * scale;
141-
if ((r + c) & 1) f = -f; // Also generate some negative values.
142-
row[c] = f;
143-
}
144-
Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
145-
MakeSpan(compressed.Row(r), extents.cols),
146-
/*packed_ofs=*/0);
147-
});
137+
ParallelFor(ParallelismStrategy::kFlat, extents.rows, ctx, /*cluster_idx=*/0,
138+
Callers::kTest, [&](size_t r, size_t thread) {
139+
float* HWY_RESTRICT row = raw.Row(r);
140+
for (size_t c = 0; c < extents.cols; c++) {
141+
float f = static_cast<float>(c * extents.rows + r) * scale;
142+
if ((r + c) & 1)
143+
f = -f; // Also generate some negative values.
144+
row[c] = f;
145+
}
146+
Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
147+
MakeSpan(compressed.Row(r), extents.cols),
148+
/*packed_ofs=*/0);
149+
});
148150

149151
// Arbitrary value, different from 1, must match `GenerateMat`.
150152
compressed.SetScale(0.6f);

0 commit comments

Comments
 (0)