ConorWilliams
diff --git a/‎benchmark/lib/CMakeLists.txt‎
Lines changed: 12 additions & 1 deletion b/‎benchmark/lib/CMakeLists.txt‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎benchmark/lib/bench.hpp‎
Lines changed: 62 additions & 0 deletions b/‎benchmark/lib/bench.hpp‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎benchmark/lib/fib.hpp‎
Lines changed: 22 additions & 0 deletions b/‎benchmark/lib/fib.hpp‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎benchmark/lib/fold.hpp‎
Lines changed: 15 additions & 16 deletions b/‎benchmark/lib/fold.hpp‎
Lines changed: 15 additions & 16 deletions
diff --git a/‎benchmark/lib/heat.hpp‎
Lines changed: 93 additions & 0 deletions b/‎benchmark/lib/heat.hpp‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎benchmark/lib/integrate.hpp‎
Lines changed: 42 additions & 0 deletions b/‎benchmark/lib/integrate.hpp‎
Lines changed: 42 additions & 0 deletions
@@ -9,10 +9,21 @@ target_sources(benchmark_common
     FILE_SET HEADERS
     BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}
     FILES
+      bench.hpp
       fib.hpp
       fold.hpp
-      uts.hpp
+      heat.hpp
+      integrate.hpp
+      knapsack.hpp
       macros.hpp
+      mandelbrot.hpp
+      matmul.hpp
+      nqueens.hpp
+      primes.hpp
+      quicksort.hpp
+      scan.hpp
+      skynet.hpp
+      uts.hpp
 )
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../external/uts external/uts)
 
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstdint>
+  #include <format>
+  #include <functional>
+#else
+import std;
+#endif
+
+namespace lf_bench {
+
+// Sentinel thread count. `report_threads` returns without recording anything when
+// it is handed this value, and the `bench` overloads that take no `threads`
+// argument pass it on the caller's behalf.
+inline constexpr std::int64_t no_threads = 0;
+
+// Returns the reciprocal 1 / n of the iteration count `n`, evaluated in double
+// precision. NOTE(review): presumably intended as a custom complexity curve for
+// benchmarks whose N is the thread count - confirm against the call sites.
+inline auto inverse_complexity(benchmark::IterationCount n) -> double {
+  auto const denominator = static_cast<double>(n);
+  return 1.0 / denominator;
+}
+
+// Publishes the thread count on `state`: once as the user counter "p" and once as
+// the complexity N. A `threads` value equal to `no_threads` leaves `state` untouched.
+inline void report_threads(benchmark::State &state, std::int64_t threads) {
+  if (threads != no_threads) {
+    auto const complexity_n = static_cast<benchmark::IterationCount>(threads);
+
+    state.counters["p"] = static_cast<double>(threads);
+    state.SetComplexityN(complexity_n);
+  }
+}
+
+// Times `fn` under `state` and validates every result it produces.
+//
+// - `threads` is handed to `report_threads` before the timing loop starts.
+// - `check(result, expected)` is evaluated on each iteration; the first time it
+//   yields false the run is flagged through `state.SkipWithError` and the loop is
+//   abandoned.
+// - The failure message is built with `std::format` from both `result` and
+//   `expected`, so `Expected` and `std::invoke_result_t<Fn>` must be formattable.
+template <typename Expected, typename Check, typename Fn>
+void bench(benchmark::State &state, std::int64_t threads, const Expected &expected, Check check, Fn fn) {
+  report_threads(state, threads);
+
+  for (auto _ : state) {
+    auto result = std::invoke(fn);
+
+    if (std::invoke(check, result, expected)) {
+      // Keep the computed value observable so the call is not optimised away.
+      benchmark::DoNotOptimize(result);
+      continue;
+    }
+
+    state.SkipWithError(std::format("incorrect result: {} != {}", result, expected));
+    break;
+  }
+}
+
+// Overload without a custom check: the result of `fn` is compared against
+// `expected` using `std::equal_to<>`.
+template <typename Expected, typename Fn>
+void bench(benchmark::State &state, std::int64_t threads, const Expected &expected, Fn fn) {
+  bench(state, threads, expected, std::equal_to<>{}, fn);
+}
+
+// Overload without a thread count: forwards `no_threads`, so `report_threads`
+// records nothing on `state`.
+template <typename Expected, typename Check, typename Fn>
+void bench(benchmark::State &state, const Expected &expected, Check check, Fn fn) {
+  bench(state, no_threads, expected, check, fn);
+}
+
+// Overload with neither a thread count nor a custom check: delegates with
+// `no_threads` in the thread-count position.
+template <typename Expected, typename Fn>
+void bench(benchmark::State &state, const Expected &expected, Fn fn) {
+  bench(state, no_threads, expected, fn);
+}
+
+} // namespace lf_bench
@@ -1,7 +1,12 @@
 #pragma once
 
+#include <benchmark/benchmark.h>
+
+#include "bench.hpp"
+
 #ifdef LF_BENCH_NO_IMPORT_STD
   #include <cstdint>
+  #include <functional>
 #else
 import std;
 #endif
@@ -29,3 +34,20 @@ constexpr auto fib_ref(std::int64_t n) -> std::int64_t {
 
   return curr;
 }
+
+// Benchmarks `fn(n)` for n = `state.range(0)`, checking each result against
+// `fib_ref(n)`. The value of n is exposed as the user counter "n" and `threads`
+// is passed through to `lf_bench::bench`.
+template <typename Fn>
+void run_fib(benchmark::State &state, std::int64_t threads, Fn fn) {
+  std::int64_t n = state.range(0);
+  state.counters["n"] = static_cast<double>(n);
+
+  std::int64_t expect = fib_ref(n);
+
+  auto compute = [n, fn]() -> std::int64_t {
+    return std::invoke(fn, n);
+  };
+
+  lf_bench::bench(state, threads, expect, compute);
+}
+
+// Overload without a thread count: forwards `lf_bench::no_threads`.
+template <typename Fn>
+void run_fib(benchmark::State &state, Fn fn) {
+  run_fib(state, lf_bench::no_threads, fn);
+}
@@ -8,13 +8,11 @@
   #include <concepts>
   #include <cstddef>
   #include <cstdint>
-  #include <format>
   #include <functional>
   #include <new>
   #include <ranges>
   #include <span>
   #include <type_traits>
-  #include <utility>
   #include <vector>
 #else
 import std;
@@ -63,29 +61,30 @@ auto fold_result_is_correct(fold_accum_t<T> result, fold_accum_t<T> expect) -> b
 }
 
+// Benchmarks `fn(range)` over a fold input of n = `state.range(0)` elements.
+//
+// The input comes from `make_fold_range<T>(n)`; when `Data` is
+// `fold_data_mode::memory` it is first materialised into a `std::vector<T>`,
+// otherwise the range is passed as produced. Each result is validated against
+// `expected_fold_result<T>(n)` with `fold_result_is_correct<T>`, `threads` is
+// passed through to `lf_bench::bench`, and n items per iteration are reported
+// via `SetItemsProcessed`.
 template <fold_data_mode Data, typename T, typename Fn>
-void run_fold_input(benchmark::State &state, Fn &&fn) {
-
+void run_fold_input(benchmark::State &state, std::int64_t threads, Fn fn) {
   auto n = static_cast<std::size_t>(state.range(0));
   auto expect = expected_fold_result<T>(n);
 
-  auto bench = [&](auto range) -> void {
-    for (auto _ : state) {
-      if (auto result = std::invoke(fn, range); !fold_result_is_correct<T>(result, expect)) {
-        state.SkipWithError(std::format("incorrect result: {} != {}", result, expect));
-        break;
-      }
-    }
+  auto run = [&](auto const &range) -> void {
+    lf_bench::bench(state, threads, expect, fold_result_is_correct<T>, [&]() -> fold_accum_t<T> {
+      return std::invoke(fn, range);
+    });
   };
 
   if constexpr (Data == fold_data_mode::memory) {
-    bench(make_fold_range<T>(n) | std::ranges::to<std::vector<T>>());
+    run(make_fold_range<T>(n) | std::ranges::to<std::vector<T>>());
   } else {
-    bench(make_fold_range<T>(n));
+    run(make_fold_range<T>(n));
   }
 
   state.SetItemsProcessed(state.iterations() * static_cast<std::int64_t>(n));
 }
 
+// Overload without a thread count: forwards `lf_bench::no_threads`.
+template <fold_data_mode Data, typename T, typename Fn>
+void run_fold_input(benchmark::State &state, Fn fn) {
+  run_fold_input<Data, T>(state, lf_bench::no_threads, fn);
+}
+
 // Use aliases for shorter names.
 inline constexpr auto memory = fold_data_mode::memory;
 inline constexpr auto lazy = fold_data_mode::lazy;
@@ -98,13 +97,13 @@ inline constexpr auto async_proj = fold_projection_mode::async;
 using int32 = std::int32_t;
 using float32 = float;
 
+// Expands to three BENCH_ONE invocations for `bench_fn`: (test, fold),
+// (base, fold_1024) and (base, fold_1024_sq). Any extra arguments are appended
+// to each invocation. NOTE(review): BENCH_ONE is defined outside this hunk -
+// presumably in macros.hpp; confirm.
-#define LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name, ...)                                                   \
+#define LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name, ...)                                             \
   BENCH_ONE(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__)                                 \
   BENCH_ONE(bench_fn, category, name, base, fold_1024 __VA_OPT__(, ) __VA_ARGS__)                            \
-  BENCH_ONE(bench_fn, category, name, base, fold_1024_sq __VA_OPT__(, ) __VA_ARGS__)                         \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024_sq __VA_OPT__(, ) __VA_ARGS__)
 
+// Everything LF_FOLD_BENCH_SIZES_SMALL expands to, plus one more BENCH_ONE
+// invocation for (base, fold_1024_cu).
 #define LF_FOLD_BENCH_SIZES(bench_fn, category, name, ...)                                                   \
-  LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name __VA_OPT__(, ) __VA_ARGS__)                                 \
+  LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name __VA_OPT__(, ) __VA_ARGS__)                             \
   BENCH_ONE(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)
 
 #define LF_FOLD_BENCH_SIZES_MT(bench_fn, category, name, ...)                                                \
 
@@ -0,0 +1,93 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cmath>
+  #include <cstddef>
+  #include <functional>
+  #include <utility>
+  #include <vector>
+#else
+import std;
+#endif
+
+// NOTE(review): presumably the grid side lengths used for the "test" and "base"
+// benchmark variants - they are not referenced in this header; confirm at the
+// registration sites.
+inline constexpr std::size_t heat_test = 64;
+inline constexpr std::size_t heat_base = 1024;
+
+// Number of Jacobi sweeps: `run_heat` passes it both to the benchmarked function
+// and to `heat_reference`.
+inline constexpr std::size_t heat_iters = 16;
+
+// Builds the n-by-n starting grid (stored row-major, index = y * n + x) from a
+// fixed analytic profile: exp(-8 * (dx^2 + dy^2)), where dx and dy are the cell
+// coordinates mapped linearly onto [-0.5, 0.5].
+//
+// n == 0 yields an empty grid. n == 1 yields a single cell evaluated at
+// dx = dy = -0.5 instead of dividing zero by zero.
+inline auto heat_make_grid(std::size_t n) -> std::vector<double> {
+  std::vector<double> g(n * n);
+
+  // With n == 1, `n - 1` is zero and 0.0 / 0.0 would fill the grid with NaN, so
+  // the denominator is clamped to 1. For n >= 2 the values are unchanged.
+  double const span = static_cast<double>(n > 1 ? n - 1 : 1);
+
+  for (std::size_t y = 0; y < n; ++y) {
+    // dy depends only on the row, so compute it once per row.
+    double const dy = static_cast<double>(y) / span - 0.5;
+    for (std::size_t x = 0; x < n; ++x) {
+      double const dx = static_cast<double>(x) / span - 0.5;
+      g[y * n + x] = std::exp(-8.0 * (dx * dx + dy * dy));
+    }
+  }
+  return g;
+}
+
+// Returns true when `actual` and `expected` have the same length and every pair
+// of corresponding elements differs by at most 1e-12 in absolute value.
+//
+// Grids of different sizes never match, and any element whose difference is NaN
+// (for example a NaN produced by a faulty kernel) counts as a mismatch.
+inline auto heat_matches(std::vector<double> const &actual, std::vector<double> const &expected) -> bool {
+  // Indexing `expected` with `actual`'s indices is only valid for equal lengths.
+  if (actual.size() != expected.size()) {
+    return false;
+  }
+  for (std::size_t i = 0; i < actual.size(); ++i) {
+    // Negated `<=` rather than `>`: every comparison with NaN is false, so this
+    // form rejects NaN differences instead of silently accepting them.
+    if (!(std::abs(actual[i] - expected[i]) <= 1e-12)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Performs one Jacobi sweep over an n-by-n grid stored row-major
+// (index = y * n + x).
+//
+// Every interior cell of `dst` becomes the average of its four direct neighbours
+// in `src`; the cells on the first/last row and first/last column are copied from
+// `src` unchanged. Both buffers must hold at least n * n elements. An empty grid
+// (n == 0) is a no-op and the pointers are not dereferenced.
+inline void heat_jacobi_step(double const *src, double *dst, std::size_t n) {
+  // `n - 1` is unsigned: without this guard n == 0 would wrap around and the
+  // interior loop would run far past the end of both buffers.
+  if (n == 0) {
+    return;
+  }
+
+  std::size_t const last = n - 1;
+
+  // Interior: four-point average.
+  for (std::size_t y = 1; y < last; ++y) {
+    for (std::size_t x = 1; x < last; ++x) {
+      std::size_t const i = y * n + x;
+      dst[i] = 0.25 * (src[i - 1] + src[i + 1] + src[i - n] + src[i + n]);
+    }
+  }
+
+  // Top and bottom rows are carried over as-is.
+  for (std::size_t x = 0; x < n; ++x) {
+    dst[x] = src[x];
+    dst[last * n + x] = src[last * n + x];
+  }
+
+  // Left and right columns are carried over as-is.
+  for (std::size_t y = 0; y < n; ++y) {
+    dst[y * n] = src[y * n];
+    dst[y * n + last] = src[y * n + last];
+  }
+}
+
+// Serial reference solution: applies `iters` sweeps of `heat_jacobi_step` to the
+// n-by-n grid `initial` and returns the final grid. Two buffers are ping-ponged
+// between sweeps, and whichever one received the last sweep is returned.
+inline auto
+heat_reference(std::vector<double> initial, std::size_t n, std::size_t iters) -> std::vector<double> {
+  std::vector<double> scratch(initial.size());
+  double *current = initial.data();
+  double *next = scratch.data();
+
+  for (std::size_t remaining = iters; remaining > 0; --remaining) {
+    heat_jacobi_step(current, next, n);
+    std::swap(current, next);
+  }
+
+  // After the final swap `current` points at the most recently written buffer.
+  if (current != initial.data()) {
+    return scratch;
+  }
+  return initial;
+}
+
+// Benchmarks `fn(a, b, n, heat_iters)` on an n-by-n grid, n = `state.range(0)`.
+//
+// Before every call buffer `a` is reset to the initial grid. Afterwards the
+// buffer expected to hold the result (`a` when `heat_iters` is even, `b`
+// otherwise) is compared with the serial `heat_reference` solution. The lambda
+// returns that verdict, which `lf_bench::bench` checks against `true`.
+// Counters "n" and "iters" are recorded on `state`.
+template <typename Fn>
+void run_heat(benchmark::State &state, Fn fn) {
+  auto n = static_cast<std::size_t>(state.range(0));
+
+  std::vector<double> initial = heat_make_grid(n);
+  std::vector<double> reference = heat_reference(initial, n, heat_iters);
+  std::vector<double> a(initial.size());
+  std::vector<double> b(initial.size());
+
+  state.counters["n"] = static_cast<double>(n);
+  state.counters["iters"] = static_cast<double>(heat_iters);
+
+  auto step_and_verify = [&]() -> bool {
+    a = initial;
+    std::invoke(fn, a.data(), b.data(), n, heat_iters);
+    benchmark::DoNotOptimize(a.data());
+
+    bool const result_in_a = (heat_iters % 2 == 0);
+    return heat_matches(result_in_a ? a : b, reference);
+  };
+
+  lf_bench::bench(state, true, step_and_verify);
+}
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cmath>
+  #include <cstdint>
+  #include <functional>
+#else
+import std;
+#endif
+
+// NOTE(review): presumably the problem sizes (upper integration bounds) for the
+// "test" and "base" benchmark variants - not referenced in this header; confirm
+// at the registration sites.
+inline constexpr std::int64_t integrate_test = 100;
+inline constexpr std::int64_t integrate_base = 10'000;
+
+// NOTE(review): not used by anything in this header; presumably the tolerance
+// handed to the integration routines under test - confirm against callers.
+inline constexpr double integrate_epsilon = 1.0e-9;
+
+// Integrand: f(x) = (x^2 + 1) * x.
+inline constexpr auto integrate_fn(double x) -> double {
+  double const quadratic = x * x + 1.0;
+  return quadratic * x;
+}
+
+// Closed-form integral of `integrate_fn` over [a, b], obtained from the
+// antiderivative F(x) = 0.25 * x^2 * (x^2 + 2) as F(b) - F(a).
+inline constexpr auto integrate_exact(double a, double b) -> double {
+  auto antiderivative = [](double x) -> double {
+    return 0.25 * x * x * (x * x + 2);
+  };
+  double const at_upper = antiderivative(b);
+  double const at_lower = antiderivative(a);
+  return at_upper - at_lower;
+}
+
+// Relative-tolerance comparison: true when `result` lies within 0.1% of
+// `expect`, i.e. |result - expect| <= 1e-3 * |expect|. When `expect` is zero the
+// allowed error is zero, so only an exact match passes.
+inline auto integrate_is_close(double result, double expect) -> bool {
+  double const allowed = 1e-3 * std::abs(expect);
+  double const error = std::abs(result - expect);
+  return error <= allowed;
+}
+
+// Benchmarks `fn(upper)` where `upper` is `state.range(0)` converted to double.
+// Each result is validated with `integrate_is_close` against the closed-form
+// value `integrate_exact(0, upper)`. The range value is recorded as counter "n".
+template <typename Fn>
+void run_integrate(benchmark::State &state, Fn fn) {
+  std::int64_t n = state.range(0);
+  state.counters["n"] = static_cast<double>(n);
+
+  double upper = static_cast<double>(n);
+  double expect = integrate_exact(0, upper);
+
+  auto integrate_once = [upper, fn]() -> double {
+    return std::invoke(fn, upper);
+  };
+
+  lf_bench::bench(state, expect, integrate_is_close, integrate_once);
+}