[V4] Fold algorithm (#107)

ConorWilliams · web-flow · commit 46bae8a8f147 · 2026-05-10T16:58:40.000+01:00
* TMP benchmarks * fold * Revert "TMP benchmarks" This reverts commit 90b94e3. * benchmarks * format * spell * pass 2 * dedupe * use macro * no namespace * register benchmarks individually * dry run * Revert "dry run" This reverts commit 2eedb57. * Revert "register benchmarks individually" This reverts commit 4f4f19a. * drop gb test * drop counters * clean ups * drop alias * use range * tmp * powers of KiB * diff mode * simplify * more simplification * items only * renames * new style tmp2 * complete new style * special path for n=1 * optimize * decltype * special case handling * clean ups * todo * drop unneeded * serial does type promotion * less benchmarks * multithreaded benchmarks * shorter name * omit slow benchmarks * drop small reduction benchmark * use invoke * extra tests * init explicit * optional API * drop duplicates * exception support * tidy up * more clean ups * add default_movable constraint * fix test
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -106,6 +106,7 @@ target_sources(libfork_libfork
       # libfork.algorithm
       src/algorithm/algorithm.cxx
       src/algorithm/for_each.cxx
+      src/algorithm/fold.cxx
       src/algorithm/concepts.cxx
     PRIVATE
       src/exception.cpp
diff --git a/benchmark/lib/CMakeLists.txt b/benchmark/lib/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(benchmark_common
     BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}
     FILES
       fib.hpp
+      fold.hpp
       uts.hpp
       macros.hpp
 )
diff --git a/benchmark/lib/fold.hpp b/benchmark/lib/fold.hpp
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cmath>
+  #include <concepts>
+  #include <cstddef>
+  #include <cstdint>
+  #include <format>
+  #include <functional>
+  #include <new>
+  #include <ranges>
+  #include <span>
+  #include <type_traits>
+  #include <utility>
+  #include <vector>
+#else
+import std;
+#endif
+
+inline constexpr std::int64_t fold_test = 10; // Smallest size: 10.
+// NOTE(review): these names look like they are assembled by token pasting inside BENCH_ONE - confirm in macros.hpp.
+inline constexpr std::int64_t fold_1024 = 1'024;
+inline constexpr std::int64_t fold_1024_base = fold_1024;                            // 1024
+inline constexpr std::int64_t fold_1024_sq_base = fold_1024 * fold_1024;             // 1024^2
+inline constexpr std::int64_t fold_1024_cu_base = fold_1024 * fold_1024 * fold_1024; // 1024^3
+
+enum class fold_data_mode : char { memory, lazy };                  // Input copied into a std::vector, or left as a lazy view.
+enum class fold_chunk_mode : char { explicit_one, deduced, fixed }; // Chunk argument of 1, no chunk argument, or a fixed chunk.
+enum class fold_projection_mode : char { sync, async };             // Plain callable projection, or a coroutine projection.
+
+/// Deterministic input element for position `index`: the values cycle 0, 1, 2, 3 and are
+/// converted to `T`.
+template <typename T>
+constexpr auto fold_value(std::size_t index) -> T {
+  constexpr std::size_t period = 4UZ;
+  const std::size_t phase = index % period;
+  return static_cast<T>(phase);
+}
+
+/// Build a lazy view of `count` elements whose i-th element is `fold_value<T>(i)`.
+///
+/// Nothing is materialised here; elements are computed on access.
+template <typename T>
+constexpr auto make_fold_range(std::size_t count) {
+  auto indices = std::views::iota(std::size_t{0}, count);
+  auto to_value = [](std::size_t index) -> T {
+    return fold_value<T>(index);
+  };
+  return std::views::transform(indices, to_value);
+}
+
+template <typename T> // Accumulator type: double when T is float, std::int64_t for every other T.
+using fold_accum_t = std::conditional_t<std::same_as<T, float>, double, std::int64_t>;
+
+/// Closed-form sum of `fold_value<T>(i)` for `i` in `[0, count)`.
+///
+/// Every complete run of four indices contributes 0 + 1 + 2 + 3 = 6; a trailing partial run of
+/// `r` indices contributes 0 + ... + (r - 1) = r * (r - 1) / 2.
+template <typename T>
+constexpr auto expected_fold_result(std::size_t count) -> fold_accum_t<T> {
+  using accum = fold_accum_t<T>;
+  const std::size_t full_cycles = count / 4UZ;
+  const std::size_t leftover = count % 4UZ;
+  // Spell out the empty tail instead of leaning on unsigned wrap-around of (0 - 1); same value.
+  const std::size_t tail_sum = (leftover == 0UZ) ? 0UZ : (leftover * (leftover - 1UZ)) / 2UZ;
+  return static_cast<accum>((full_cycles * 6UZ) + tail_sum);
+}
+
+/// Check a benchmark result against its reference value.
+///
+/// Floating-point accumulators are accepted within an absolute tolerance of 1e-6; integral
+/// accumulators must match exactly.
+///
+/// @param result Value produced by the fold under test.
+/// @param expect Reference value to compare against.
+/// @return `true` when `result` is accepted.
+template <typename T>
+[[nodiscard]] auto fold_result_is_correct(fold_accum_t<T> result, fold_accum_t<T> expect) -> bool {
+  if constexpr (std::floating_point<fold_accum_t<T>>) {
+    constexpr fold_accum_t<T> tolerance = 1e-6;
+    // The floating-point overloads of std::abs are declared in <cmath>.
+    return std::abs(result - expect) <= tolerance;
+  } else {
+    return result == expect;
+  }
+}
+
+/// Shared benchmark driver: builds the input for `state.range(0)` elements and repeatedly
+/// invokes `fn` on it, validating every result.
+///
+/// @tparam Data `memory` copies the input into a `std::vector<T>` first; otherwise the lazy view
+///              from `make_fold_range` is passed through unchanged.
+/// @tparam T    Element type of the input.
+/// @param state Benchmark state; a wrong result is reported through `SkipWithError`.
+/// @param fn    Callable invoked with the input range; its result is compared with
+///              `expected_fold_result<T>`.
+template <fold_data_mode Data, typename T, typename Fn>
+void run_fold_input(benchmark::State &state, Fn &&fn) {
+
+  const auto count = static_cast<std::size_t>(state.range(0));
+  const auto expected = expected_fold_result<T>(count);
+
+  // The input is taken by value so the generic lambda owns whatever it is handed.
+  auto time_fold = [&state, &fn, &expected](auto input) -> void {
+    for (auto _ : state) {
+      auto actual = std::invoke(fn, input);
+      if (fold_result_is_correct<T>(actual, expected)) {
+        continue;
+      }
+      // First mismatch: report it and stop timing.
+      state.SkipWithError(std::format("incorrect result: {} != {}", actual, expected));
+      break;
+    }
+  };
+
+  if constexpr (Data != fold_data_mode::memory) {
+    time_fold(make_fold_range<T>(count));
+  } else {
+    time_fold(make_fold_range<T>(count) | std::ranges::to<std::vector<T>>());
+  }
+
+  const auto items_per_iteration = static_cast<std::int64_t>(count);
+  state.SetItemsProcessed(state.iterations() * items_per_iteration);
+}
+
+// Short aliases for the enumerators and element types, so benchmark registrations stay compact.
+inline constexpr auto memory = fold_data_mode::memory;
+inline constexpr auto lazy = fold_data_mode::lazy;
+inline constexpr auto chunk_1 = fold_chunk_mode::explicit_one;
+inline constexpr auto chunk_deduced = fold_chunk_mode::deduced;
+inline constexpr auto chunk_fixed = fold_chunk_mode::fixed;
+inline constexpr auto sync_proj = fold_projection_mode::sync;
+inline constexpr auto async_proj = fold_projection_mode::async;
+
+using int32 = std::int32_t;
+using float32 = float;
+
+// Register `bench_fn` at the three smaller sizes (test, 1024 and 1024^2).
+// NOTE(review): assumes BENCH_ONE (macros.hpp) combines its 4th and 5th arguments into the
+// fold_test / fold_1024_base / fold_1024_sq_base constants - confirm.
+// The last line deliberately has no trailing backslash, so the macro ends where it appears to.
+#define LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name, ...)                                             \
+  BENCH_ONE(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__)                                 \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024 __VA_OPT__(, ) __VA_ARGS__)                            \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024_sq __VA_OPT__(, ) __VA_ARGS__)
+
+// Everything LF_FOLD_BENCH_SIZES_SMALL registers, plus the largest (1024^3) size.
+#define LF_FOLD_BENCH_SIZES(bench_fn, category, name, ...)                                                   \
+  LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name __VA_OPT__(, ) __VA_ARGS__)                             \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)
+
+// Multi-threaded registration via BENCH_ONE_MT: only the test size and the largest (1024^3) size.
+#define LF_FOLD_BENCH_SIZES_MT(bench_fn, category, name, ...)                                                \
+  BENCH_ONE_MT(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__)                              \
+  BENCH_ONE_MT(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)
diff --git a/benchmark/src/libfork/CMakeLists.txt b/benchmark/src/libfork/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library(libfork_benchmarks)
 target_sources(libfork_benchmarks
   PRIVATE
     fib.cpp
+    fold.cpp
     uts.cpp
     switch_io_pool.cpp
     switch_random.cpp
diff --git a/benchmark/src/libfork/fold.cpp b/benchmark/src/libfork/fold.cpp
@@ -0,0 +1,115 @@
+#include <benchmark/benchmark.h>
+
+#include "fold.hpp"
+
+#include "helpers.hpp"
+
+import std;
+
+import libfork;
+
+namespace {
+
+/// Synchronous projection: converts a `T` element to the accumulator type `fold_accum_t<T>`.
+template <typename T>
+struct sync_projection {
+  static constexpr auto operator()(T value) -> fold_accum_t<T> {
+    using accum = fold_accum_t<T>;
+    return static_cast<accum>(value);
+  }
+};
+
+/// Coroutine projection: same conversion as the synchronous one, but delivered through an
+/// `lf::task` so the fold exercises its async-projection path.
+template <typename T>
+struct async_projection {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, T value) -> lf::task<fold_accum_t<T>, Context> {
+    using accum = fold_accum_t<T>;
+    co_return static_cast<accum>(value);
+  }
+};
+
+/// Create the projection object selected by `Projection`: `sync_projection<T>` for `sync`,
+/// `async_projection<T>` for anything else.
+template <fold_projection_mode Projection, typename T>
+constexpr auto make_projection() {
+  constexpr bool is_sync = (Projection == fold_projection_mode::sync);
+  return std::conditional_t<is_sync, sync_projection<T>, async_projection<T>>{};
+}
+
+/// Schedule one `lf::fold` over `range` on `pool`, block for the result and return it.
+///
+/// @tparam Chunk      `deduced` omits the chunk argument; `explicit_one` passes 1; any other
+///                    value passes a fixed chunk of 4096.
+/// @tparam Projection Chooses the synchronous or coroutine projection.
+/// @tparam T          Element type; the result is `fold_accum_t<T>`.
+///
+/// NOTE(review): the scheduled result is dereferenced without a check - presumably it is empty
+/// only for an empty range, which the benchmarks never pass; confirm against lf::fold.
+template <fold_chunk_mode Chunk,
+          fold_projection_mode Projection,
+          typename T,
+          lf::scheduler Sch,
+          typename Range>
+auto run_fold(Sch &pool, Range &&range) -> fold_accum_t<T> {
+
+  auto proj = make_projection<Projection, T>();
+
+  if constexpr (Chunk != fold_chunk_mode::deduced) {
+    using difference = std::ranges::range_difference_t<Range>;
+    constexpr difference chunk_size =
+        (Chunk == fold_chunk_mode::explicit_one) ? difference{1} : difference{4096};
+    auto folded = lf::schedule(pool,
+                               lf::fold,
+                               std::ranges::begin(range),
+                               std::ranges::end(range),
+                               chunk_size,
+                               std::plus<>{},
+                               std::move(proj))
+                      .get();
+    return *std::move(folded);
+  } else {
+    auto folded = lf::schedule(pool,
+                               lf::fold,
+                               std::ranges::begin(range),
+                               std::ranges::end(range),
+                               std::plus<>{},
+                               std::move(proj))
+                      .get();
+    return *std::move(folded);
+  }
+}
+
+/// Benchmark body using a `mono_busy_pool` constructed with the argument 1.
+/// NOTE(review): presumably that argument is the worker count - confirm in helpers.hpp.
+template <fold_data_mode Data, fold_chunk_mode Chunk, fold_projection_mode Projection, typename T>
+void run(benchmark::State &state) {
+
+  mono_busy_pool pool{1};
+
+  // One timed unit of work: fold the supplied input on the pool.
+  auto fold_once = [&pool](auto &&values) -> fold_accum_t<T> {
+    return run_fold<Chunk, Projection, T>(pool, std::forward<decltype(values)>(values));
+  };
+
+  run_fold_input<Data, T>(state, fold_once);
+}
+
+/// Multi-threaded benchmark body: records the thread count as counter "p" and as the
+/// complexity N, builds a scheduler of type `Sch` from `state`, then runs the fold on it.
+template <fold_data_mode Data,
+          fold_chunk_mode Chunk,
+          fold_projection_mode Projection,
+          typename T,
+          lf::scheduler Sch>
+void run_mt(benchmark::State &state) {
+
+  state.counters["p"] = static_cast<double>(thread_count<Sch>(state));
+  state.SetComplexityN(static_cast<benchmark::IterationCount>(thread_count<Sch>(state)));
+
+  Sch pool = make_scheduler<Sch>(state);
+
+  // One timed unit of work: fold the supplied input on the pool.
+  auto fold_once = [&pool](auto &&values) -> fold_accum_t<T> {
+    return run_fold<Chunk, Projection, T>(pool, std::forward<decltype(values)>(values));
+  };
+
+  run_fold_input<Data, T>(state, fold_once);
+}
+
+} // namespace
+
+// Fixed-chunk, sync-projection versions (int32 and float32, memory and lazy) to mirror the serial benchmarks.
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, int32)
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32)
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, int32)
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32)
+
+// Compare chunk size 1 against the deduced chunk, each with sync and async projections (largest size omitted).
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, sync_proj, float32)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, sync_proj, float32)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, async_proj, float32)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, async_proj, float32)
+
+#define MT(...) LF_FOLD_BENCH_SIZES_MT(__VA_ARGS__) // Local shorthand for the registrations below.
+
+// Multi-threaded runs: float32, fixed chunk, sync projection, on mono_busy_pool and poly_busy_pool.
+MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, mono_busy_pool)
+MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, mono_busy_pool)
+MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, poly_busy_pool)
+MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, poly_busy_pool)
diff --git a/benchmark/src/serial/CMakeLists.txt b/benchmark/src/serial/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_library(serial_benchmarks)
 
-target_sources(serial_benchmarks PRIVATE fib.cpp uts.cpp)
+target_sources(serial_benchmarks PRIVATE fib.cpp fold.cpp uts.cpp)
 
 target_link_libraries(serial_benchmarks PUBLIC benchmark_common)
diff --git a/benchmark/src/serial/fold.cpp b/benchmark/src/serial/fold.cpp
@@ -0,0 +1,27 @@
+#include <benchmark/benchmark.h>
+
+#include "fold.hpp"
+
+import std;
+
+namespace {
+
+/// Serial baseline: sums the input with `std::reduce`, converting both operands to
+/// `fold_accum_t<T>` before each addition.
+template <fold_data_mode Data, typename T>
+void fold_reduce(benchmark::State &state) {
+  using accum = fold_accum_t<T>;
+
+  run_fold_input<Data, T>(state, [](auto &&values) -> accum {
+    // Widen first, then add, so the arithmetic happens in the accumulator type.
+    auto widening_plus = [](auto lhs, auto rhs) static {
+      return static_cast<accum>(lhs) + static_cast<accum>(rhs);
+    };
+    return std::reduce(std::ranges::begin(values), std::ranges::end(values), accum{}, widening_plus);
+  });
+}
+
+} // namespace
+
+#define LF_REGISTER_FOLD_REDUCE(data, dtype)                                                                 \
+  LF_FOLD_BENCH_SIZES(fold_reduce, serial, fold / std_reduce, data, dtype) // Forwards data and dtype as the trailing arguments.
+
+LF_REGISTER_FOLD_REDUCE(memory, int32)
+LF_REGISTER_FOLD_REDUCE(memory, float32)
+LF_REGISTER_FOLD_REDUCE(lazy, int32)
+LF_REGISTER_FOLD_REDUCE(lazy, float32)
diff --git a/src/algorithm/algorithm.cxx b/src/algorithm/algorithm.cxx
@@ -2,3 +2,4 @@ export module libfork.algorithm;
 
 export import :concepts;
 export import :for_each;
+export import :fold;
diff --git a/src/algorithm/concepts.cxx b/src/algorithm/concepts.cxx
@@ -9,4 +9,7 @@ namespace lf {
 export template <typename T>
 concept sized_random_access_range = std::ranges::random_access_range<T> && std::ranges::sized_range<T>;
 
+template <typename T> // Not exported: satisfied when T is both default-initializable and movable.
+concept default_movable = std::default_initializable<T> && std::movable<T>;
+
 } // namespace lf
diff --git a/src/algorithm/fold.cxx b/src/algorithm/fold.cxx
diff --git a/src/core/concepts/semigroup.cxx b/src/core/concepts/semigroup.cxx
diff --git a/test/src/concepts.cpp b/test/src/concepts.cpp
diff --git a/test/src/fold.cpp b/test/src/fold.cpp

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ target_sources(benchmark_common`
`10`	`10`	`BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}`
`11`	`11`	`FILES`
`12`	`12`	`fib.hpp`
	`13`	`+ fold.hpp`
`13`	`14`	`uts.hpp`
`14`	`15`	`macros.hpp`
`15`	`16`	`)`
Original file line number	Diff line number	Diff line change
`@@ -2,3 +2,4 @@ export module libfork.algorithm;`
`2`	`2`
`3`	`3`	`export import :concepts;`
`4`	`4`	`export import :for_each;`
	`5`	`+export import :fold;`