Skip to content

Commit 46bae8a

Browse files
[V4] Fold algorithm (#107)
* TMP benchmarks * fold * Revert "TMP benchmarks" This reverts commit 90b94e3. * benchmarks * format * spell * pass 2 * dedupe * use macro * no namespace * register benchmarks individually * dry run * Revert "dry run" This reverts commit 2eedb57. * Revert "register benchmarks individually" This reverts commit 4f4f19a. * drop gb test * drop counters * clean ups * drop alias * use range * tmp * powers of KiB * diff mode * simplify * more simplification * items only * renames * new style tmp2 * complete new style * special path for n=1 * optimize * decltype * special case handling * clean ups * todo * drop unneeded * serial does type promotion * less benchmarks * multithreaded benchmarks * shorter name * omit slow benchmarks * drop small reduction benchmark * use invoke * extra tests * init explicit * optional API * drop duplicates * exception support * tidy up * more clean ups * add default_movable constraint * fix test
1 parent 96e2718 commit 46bae8a

13 files changed

Lines changed: 973 additions & 15 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ target_sources(libfork_libfork
106106
# libfork.algorithm
107107
src/algorithm/algorithm.cxx
108108
src/algorithm/for_each.cxx
109+
src/algorithm/fold.cxx
109110
src/algorithm/concepts.cxx
110111
PRIVATE
111112
src/exception.cpp

benchmark/lib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ target_sources(benchmark_common
1010
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}
1111
FILES
1212
fib.hpp
13+
fold.hpp
1314
uts.hpp
1415
macros.hpp
1516
)

benchmark/lib/fold.hpp

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#pragma once
2+
3+
#include <benchmark/benchmark.h>
4+
5+
#include "macros.hpp"
6+
7+
#ifdef LF_BENCH_NO_IMPORT_STD
8+
#include <concepts>
9+
#include <cstddef>
10+
#include <cstdint>
11+
#include <format>
12+
#include <functional>
13+
#include <new>
14+
#include <ranges>
15+
#include <span>
16+
#include <type_traits>
17+
#include <utility>
18+
#include <vector>
19+
#else
20+
import std;
21+
#endif
22+
23+
// Smallest problem size (10 elements).
inline constexpr std::int64_t fold_test{10};

// Larger problem sizes are successive powers of 1024.
inline constexpr std::int64_t fold_1024{1'024};
inline constexpr std::int64_t fold_1024_base{fold_1024};
inline constexpr std::int64_t fold_1024_sq_base{fold_1024_base * fold_1024};
inline constexpr std::int64_t fold_1024_cu_base{fold_1024_sq_base * fold_1024};
29+
30+
// How the benchmark input is presented to the function under test.
enum class fold_data_mode : char {
  memory,  // the generated range is first materialised into a std::vector
  lazy,    // the generated view is passed through as-is
};

// How the chunk-size argument of the fold is chosen.
enum class fold_chunk_mode : char {
  explicit_one,  // an explicit chunk size of 1 is passed
  deduced,       // no chunk size is passed
  fixed,         // an explicit, fixed chunk size is passed
};

// Whether the projection is a plain callable or a coroutine.
enum class fold_projection_mode : char {
  sync,
  async,
};
33+
34+
// Deterministic input element for position `index`: the repeating
// sequence 0, 1, 2, 3 converted to `T`.
template <typename T>
constexpr auto fold_value(std::size_t index) -> T {
  constexpr std::size_t period{4};
  return static_cast<T>(index % period);
}
38+
39+
template <typename T>
40+
constexpr auto make_fold_range(std::size_t count) {
41+
return std::views::iota(std::size_t{}, count) | std::views::transform([](std::size_t index) -> T {
42+
return fold_value<T>(index);
43+
});
44+
}
45+
46+
// Accumulator type used when summing elements of type `T`:
// `double` when `T` is exactly `float`, `std::int64_t` for every other `T`.
template <typename T>
using fold_accum_t = std::conditional_t<std::is_same_v<T, float>, double, std::int64_t>;
48+
49+
template <typename T>
50+
constexpr auto expected_fold_result(std::size_t count) -> fold_accum_t<T> {
51+
auto groups = count / 4UZ;
52+
auto remainder = count % 4UZ;
53+
return static_cast<fold_accum_t<T>>((groups * 6UZ) + ((remainder * (remainder - 1UZ)) / 2UZ));
54+
}
55+
56+
template <typename T>
57+
auto fold_result_is_correct(fold_accum_t<T> result, fold_accum_t<T> expect) -> bool {
58+
if constexpr (std::floating_point<fold_accum_t<T>>) {
59+
return std::abs(result - expect) <= 1e-6;
60+
} else {
61+
return result == expect;
62+
}
63+
}
64+
65+
// Drives one benchmark: builds the input selected by `Data`, repeatedly
// invokes `fn` on it inside the benchmark loop and validates every result.
//
// state - benchmark state; `state.range(0)` supplies the element count.
// fn    - callable invoked as `fn(input)`; its result is compared against
//         `expected_fold_result<T>`.
//
// On the first incorrect result the benchmark is marked as skipped with an
// error message and the loop is left.
template <fold_data_mode Data, typename T, typename Fn>
void run_fold_input(benchmark::State &state, Fn &&fn) {

  const auto count = static_cast<std::size_t>(state.range(0));
  const auto expected = expected_fold_result<T>(count);

  // The input is taken by value so the lambda owns it for the whole loop.
  auto measure = [&](auto input) -> void {
    for (auto _ : state) {
      auto actual = std::invoke(fn, input);
      if (fold_result_is_correct<T>(actual, expected)) {
        continue;
      }
      state.SkipWithError(std::format("incorrect result: {} != {}", actual, expected));
      break;
    }
  };

  if constexpr (Data == fold_data_mode::memory) {
    // Materialise the view so the timed loop reads from a std::vector.
    measure(std::ranges::to<std::vector<T>>(make_fold_range<T>(count)));
  } else {
    measure(make_fold_range<T>(count));
  }

  state.SetItemsProcessed(static_cast<std::int64_t>(count) * state.iterations());
}
88+
89+
// Use aliases for shorter names.
90+
inline constexpr auto memory = fold_data_mode::memory;
91+
inline constexpr auto lazy = fold_data_mode::lazy;
92+
inline constexpr auto chunk_1 = fold_chunk_mode::explicit_one;
93+
inline constexpr auto chunk_deduced = fold_chunk_mode::deduced;
94+
inline constexpr auto chunk_fixed = fold_chunk_mode::fixed;
95+
inline constexpr auto sync_proj = fold_projection_mode::sync;
96+
inline constexpr auto async_proj = fold_projection_mode::async;
97+
98+
using int32 = std::int32_t;
99+
using float32 = float;
100+
101+
// Expands to three BENCH_ONE registrations of `bench_fn`, one per size
// argument pair: (test, fold), (base, fold_1024) and (base, fold_1024_sq).
// Any extra arguments are forwarded unchanged. BENCH_ONE is defined elsewhere
// (NOTE(review): presumably in macros.hpp, which also decides how the size
// pair is turned into a benchmark argument - confirm).
//
// The last line has no trailing backslash on purpose: the macro ends there and
// must not splice in whatever line follows it.
#define LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name, ...)                    \
  BENCH_ONE(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__)        \
  BENCH_ONE(bench_fn, category, name, base, fold_1024 __VA_OPT__(, ) __VA_ARGS__)   \
  BENCH_ONE(bench_fn, category, name, base, fold_1024_sq __VA_OPT__(, ) __VA_ARGS__)
105+
106+
// Expands to everything LF_FOLD_BENCH_SIZES_SMALL registers, plus one more
// BENCH_ONE registration for the (base, fold_1024_cu) size. Extra arguments
// are forwarded unchanged to both.
#define LF_FOLD_BENCH_SIZES(bench_fn, category, name, ...) \
107+
LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name __VA_OPT__(, ) __VA_ARGS__) \
108+
BENCH_ONE(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)
109+
110+
// Multi-threaded counterpart of the size macros: expands to two BENCH_ONE_MT
// registrations, for the (test, fold) and (base, fold_1024_cu) sizes only.
// BENCH_ONE_MT is defined elsewhere; extra arguments are forwarded unchanged.
#define LF_FOLD_BENCH_SIZES_MT(bench_fn, category, name, ...) \
111+
BENCH_ONE_MT(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__) \
112+
BENCH_ONE_MT(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)

benchmark/src/libfork/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ add_library(libfork_benchmarks)
33
target_sources(libfork_benchmarks
44
PRIVATE
55
fib.cpp
6+
fold.cpp
67
uts.cpp
78
switch_io_pool.cpp
89
switch_random.cpp

benchmark/src/libfork/fold.cpp

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#include <benchmark/benchmark.h>
2+
3+
#include "fold.hpp"
4+
5+
#include "helpers.hpp"
6+
7+
import std;
8+
9+
import libfork;
10+
11+
namespace {
12+
13+
template <typename T>
14+
struct sync_projection {
15+
static constexpr auto operator()(T value) -> fold_accum_t<T> { return static_cast<fold_accum_t<T>>(value); }
16+
};
17+
18+
template <typename T>
19+
struct async_projection {
20+
template <typename Context>
21+
static auto operator()(lf::env<Context>, T value) -> lf::task<fold_accum_t<T>, Context> {
22+
co_return static_cast<fold_accum_t<T>>(value);
23+
}
24+
};
25+
26+
template <fold_projection_mode Projection, typename T>
27+
constexpr auto make_projection() {
28+
if constexpr (Projection == fold_projection_mode::sync) {
29+
return sync_projection<T>{};
30+
} else {
31+
return async_projection<T>{};
32+
}
33+
}
34+
35+
template <fold_chunk_mode Chunk,
36+
fold_projection_mode Projection,
37+
typename T,
38+
lf::scheduler Sch,
39+
typename Range>
40+
auto run_fold(Sch &pool, Range &&range) -> fold_accum_t<T> {
41+
42+
auto projection = make_projection<Projection, T>();
43+
44+
if constexpr (Chunk == fold_chunk_mode::deduced) {
45+
auto result = lf::schedule(pool,
46+
lf::fold,
47+
std::ranges::begin(range),
48+
std::ranges::end(range),
49+
std::plus<>{},
50+
std::move(projection))
51+
.get();
52+
return *std::move(result);
53+
} else {
54+
using diff_t = std::ranges::range_difference_t<Range>;
55+
constexpr diff_t chunk = Chunk == fold_chunk_mode::explicit_one ? diff_t{1} : diff_t{4096};
56+
auto result = lf::schedule(pool,
57+
lf::fold,
58+
std::ranges::begin(range),
59+
std::ranges::end(range),
60+
chunk,
61+
std::plus<>{},
62+
std::move(projection))
63+
.get();
64+
return *std::move(result);
65+
}
66+
}
67+
68+
template <fold_data_mode Data, fold_chunk_mode Chunk, fold_projection_mode Projection, typename T>
69+
void run(benchmark::State &state) {
70+
71+
mono_busy_pool pool{1};
72+
73+
run_fold_input<Data, T>(state, [&](auto &&values) -> fold_accum_t<T> {
74+
return run_fold<Chunk, Projection, T>(pool, std::forward<decltype(values)>(values));
75+
});
76+
}
77+
78+
template <fold_data_mode Data,
79+
fold_chunk_mode Chunk,
80+
fold_projection_mode Projection,
81+
typename T,
82+
lf::scheduler Sch>
83+
void run_mt(benchmark::State &state) {
84+
85+
state.counters["p"] = static_cast<double>(thread_count<Sch>(state));
86+
state.SetComplexityN(static_cast<benchmark::IterationCount>(thread_count<Sch>(state)));
87+
88+
Sch pool = make_scheduler<Sch>(state);
89+
90+
run_fold_input<Data, T>(state, [&](auto &&values) -> fold_accum_t<T> {
91+
return run_fold<Chunk, Projection, T>(pool, std::forward<decltype(values)>(values));
92+
});
93+
}
94+
95+
} // namespace
96+
97+
// Fixed-chunk, synchronous-projection benchmarks at every size, for both data
// modes (memory, lazy) and both element types (int32, float32). These mirror
// the serial benchmarks.
98+
LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, int32)
99+
LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32)
100+
LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, int32)
101+
LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32)
102+
103+
// Compare chunk_1 against chunk_deduced, each with a synchronous and an
// asynchronous projection (float32, in-memory data). The _SMALL macro leaves
// out the largest size.
104+
LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, sync_proj, float32)
105+
LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, sync_proj, float32)
106+
LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, async_proj, float32)
107+
LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, async_proj, float32)
108+
109+
// Short local spelling of LF_FOLD_BENCH_SIZES_MT; all arguments are forwarded.
#define MT(...) LF_FOLD_BENCH_SIZES_MT(__VA_ARGS__)
110+
111+
// Multi-threaded benchmarks: float32 elements, fixed chunk size, synchronous
// projection, for both data modes on mono_busy_pool and poly_busy_pool.
112+
MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, mono_busy_pool)
113+
MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, mono_busy_pool)
114+
MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, poly_busy_pool)
115+
MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, poly_busy_pool)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
add_library(serial_benchmarks)
22

3-
target_sources(serial_benchmarks PRIVATE fib.cpp uts.cpp)
3+
target_sources(serial_benchmarks PRIVATE fib.cpp fold.cpp uts.cpp)
44

55
target_link_libraries(serial_benchmarks PUBLIC benchmark_common)

benchmark/src/serial/fold.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#include <benchmark/benchmark.h>
2+
3+
#include "fold.hpp"
4+
5+
import std;
6+
7+
namespace {
8+
9+
template <fold_data_mode Data, typename T>
10+
void fold_reduce(benchmark::State &state) {
11+
run_fold_input<Data, T>(state, [](auto &&values) -> fold_accum_t<T> {
12+
return std::reduce(
13+
std::ranges::begin(values), std::ranges::end(values), fold_accum_t<T>{}, [](auto a, auto b) static {
14+
return fold_accum_t<T>(a) + fold_accum_t<T>(b);
15+
});
16+
});
17+
}
18+
19+
} // namespace
20+
21+
// Registers `fold_reduce` at every size of LF_FOLD_BENCH_SIZES under the
// `serial` category, for one (data mode, element type) combination.
#define LF_REGISTER_FOLD_REDUCE(data, dtype) \
22+
LF_FOLD_BENCH_SIZES(fold_reduce, serial, fold / std_reduce, data, dtype)
23+
24+
LF_REGISTER_FOLD_REDUCE(memory, int32)
25+
LF_REGISTER_FOLD_REDUCE(memory, float32)
26+
LF_REGISTER_FOLD_REDUCE(lazy, int32)
27+
LF_REGISTER_FOLD_REDUCE(lazy, float32)

src/algorithm/algorithm.cxx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ export module libfork.algorithm;
22

33
export import :concepts;
44
export import :for_each;
5+
export import :fold;

src/algorithm/concepts.cxx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,7 @@ namespace lf {
99
// Satisfied by ranges that are both random-access and sized.
export template <typename T>
1010
concept sized_random_access_range = std::ranges::random_access_range<T> && std::ranges::sized_range<T>;
1111

12+
// Satisfied by types that are both default-initializable and movable.
// Declared without `export`, so it is not part of the module's exported
// interface.
template <typename T>
13+
concept default_movable = std::default_initializable<T> && std::movable<T>;
14+
1215
} // namespace lf

0 commit comments

Comments
 (0)