
Commit 0a2a38c

feat: merge-train/barretenberg (#22888)
BEGIN_COMMIT_OVERRIDE chore: chunk scalars in pip to distribute work evenly (#22627) END_COMMIT_OVERRIDE
2 parents e3ce6bc + 0c188ca commit 0a2a38c

4 files changed

Lines changed: 403 additions & 41 deletions


Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
/**
 * @brief Pippenger thread-scaling benchmark for heterogeneous scalar distributions.
 *
 * MSM::batch_multi_scalar_mul partitions work across threads by cumulative per-scalar
 * weight (see get_work_units in scalar_multiplication.cpp), where each scalar's weight
 * is ceil(bit_length / bits_per_slice) -- i.e. the number of nonzero c-bit slices it
 * contributes to bucket accumulation. Small scalars weigh less because their high-order
 * slices are zero and get filtered by the zero-bucket pre-sort. This benchmark exercises
 * pathological and typical bit-size distributions to verify thread scaling stays uniform.
 *
 * Distributions contrasted here:
 * - Clustered: first half small (32-bit), second half full random -- stresses the
 *   weighted split; count-based partitioning would give half the threads
 *   ~all of the heavy work.
 * - UniformMixed: small/full randomly interleaved -- isolates heterogeneity alone.
 * - AllFull: all full random (z_perm-like baseline).
 *
 * Expected: all three scale comparably under the weighted partition.
 */
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/curves/bn254/bn254.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/numeric/random/engine.hpp"
#include "barretenberg/srs/global_crs.hpp"

#include <benchmark/benchmark.h>

#include "barretenberg/common/google_bb_bench.hpp"

using namespace benchmark;

using Curve = bb::curve::BN254;
using Fr = Curve::ScalarField;
using G1 = Curve::AffineElement;

namespace {

constexpr size_t MSM_SIZE = 1 << 20;

enum class Distribution { Clustered, UniformMixed, AllFull };

class ThreadScalingBench : public benchmark::Fixture {
  public:
    std::shared_ptr<bb::srs::factories::Crs<Curve>> srs;
    bb::numeric::RNG& engine = bb::numeric::get_debug_randomness();

    void SetUp([[maybe_unused]] const ::benchmark::State& state) override
    {
        if (srs) {
            return;
        }
        bb::srs::init_file_crs_factory(bb::srs::bb_crs_path());
        srs = bb::srs::get_crs_factory<Curve>()->get_crs(MSM_SIZE);
    }

    // 32-bit "small" value -- mimics witness indices, booleans, limbs.
    // On BN254 (254-bit field) with ~14 bits per Pippenger slice, only the lowest
    // ~2-3 rounds produce nonzero slices for these scalars; the rest get filtered.
    Fr small_scalar() { return Fr(static_cast<uint64_t>(engine.get_random_uint32())); }
    Fr full_scalar() { return Fr::random_element(&engine); }

    std::vector<Fr> build_scalars(Distribution dist)
    {
        std::vector<Fr> scalars(MSM_SIZE);
        switch (dist) {
        case Distribution::Clustered:
            for (size_t i = 0; i < MSM_SIZE / 2; ++i) {
                scalars[i] = small_scalar();
            }
            for (size_t i = MSM_SIZE / 2; i < MSM_SIZE; ++i) {
                scalars[i] = full_scalar();
            }
            break;
        case Distribution::UniformMixed:
            for (size_t i = 0; i < MSM_SIZE; ++i) {
                scalars[i] = (engine.get_random_uint32() & 1U) ? small_scalar() : full_scalar();
            }
            break;
        case Distribution::AllFull:
            for (size_t i = 0; i < MSM_SIZE; ++i) {
                scalars[i] = full_scalar();
            }
            break;
        }
        return scalars;
    }
};

static void run_msm(ThreadScalingBench& fx, benchmark::State& state, Distribution dist)
{
    const size_t num_threads = static_cast<size_t>(state.range(0));

    // Rebuild per-invocation of the bench is fine: scalars get mutated (Montgomery
    // round-trip) inside batch_multi_scalar_mul, and we want consistent input across iterations.
    std::vector<Fr> scalars = fx.build_scalars(dist);

    std::vector<std::span<Fr>> scalar_spans;
    std::vector<std::span<const G1>> point_spans;
    scalar_spans.emplace_back(scalars);
    point_spans.emplace_back(fx.srs->get_monomial_points().subspan(0, MSM_SIZE));

    const size_t original_concurrency = bb::get_num_cpus();
    bb::set_parallel_for_concurrency(num_threads);

    for (auto _ : state) {
        GOOGLE_BB_BENCH_REPORTER(state);
        bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false);
    }

    bb::set_parallel_for_concurrency(original_concurrency);
}

BENCHMARK_DEFINE_F(ThreadScalingBench, Clustered)(benchmark::State& state)
{
    run_msm(*this, state, Distribution::Clustered);
}
BENCHMARK_DEFINE_F(ThreadScalingBench, UniformMixed)(benchmark::State& state)
{
    run_msm(*this, state, Distribution::UniformMixed);
}
BENCHMARK_DEFINE_F(ThreadScalingBench, AllFull)(benchmark::State& state)
{
    run_msm(*this, state, Distribution::AllFull);
}

static void ThreadSweep(benchmark::internal::Benchmark* b)
{
    for (int64_t t : { 1, 2, 4, 8 }) {
        b->Arg(t);
    }
}

BENCHMARK_REGISTER_F(ThreadScalingBench, Clustered)->Unit(benchmark::kMillisecond)->Apply(ThreadSweep);
BENCHMARK_REGISTER_F(ThreadScalingBench, UniformMixed)->Unit(benchmark::kMillisecond)->Apply(ThreadSweep);
BENCHMARK_REGISTER_F(ThreadScalingBench, AllFull)->Unit(benchmark::kMillisecond)->Apply(ThreadSweep);

} // namespace

BENCHMARK_MAIN();
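
To put numbers on the Clustered case above: assuming roughly 14 bits per Pippenger slice for an MSM of this size (a representative value; the real bits_per_slice comes from get_optimal_log_num_buckets) and the fixed per-scalar weight of 4 used in scalar_multiplication.cpp, a 32-bit scalar weighs ceil(32/14) + 4 = 7 while a full-width scalar weighs ceil(254/14) + 4 = 23. The small half of the Clustered input therefore carries only about 23% of the total weight, which is why a count-based split would leave half the threads with nearly all of the heavy bucket work. A minimal, self-contained sketch of that arithmetic (c = 14 and the +4 term are assumed values mirroring this commit, not constants read from the library):

#include <cstddef>

// Illustrative only: the constants are assumptions, not library values.
constexpr std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

constexpr std::size_t c = 14;                               // assumed bits per Pippenger slice
constexpr std::size_t small_weight = ceil_div(32, c) + 4;   // 32-bit scalar   -> 3 + 4 = 7
constexpr std::size_t full_weight = ceil_div(254, c) + 4;   // ~254-bit scalar -> 19 + 4 = 23

static_assert(small_weight == 7);
static_assert(full_weight == 23);
// In the Clustered distribution, half the scalars are small and half are full, so the
// small half carries small_weight / (small_weight + full_weight) = 7/30 of the weight.
static_assert(small_weight * 100 / (small_weight + full_weight) == 23); // ~23%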

barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp

Lines changed: 97 additions & 40 deletions
@@ -85,29 +85,116 @@ void MSM<Curve>::transform_scalar_and_get_nonzero_scalar_indices(std::span<typen
     });
 }

+template <typename Curve>
+void MSM<Curve>::compute_scalar_slice_weights(std::span<const typename Curve::ScalarField> scalars,
+                                              std::span<const uint32_t> nonzero_indices,
+                                              uint32_t bits_per_slice,
+                                              std::vector<uint16_t>& weights) noexcept
+{
+    // weight = ceil(bit_length / bps) + FIXED_PER_SCALAR_WEIGHT. The fixed term approximates the
+    // O(num_rounds) per-scalar overhead in build_schedule, sort_schedule, and reduce_buckets that
+    // doesn't scale with bit_length. Without it, threads assigned many lightweight scalars end up
+    // with disproportionate build/sort/reduce work (empirically observed via per-phase profiling).
+    // Max is ceil(NUM_BITS_IN_FIELD / 1) + FIXED.
+    static constexpr uint16_t FIXED_PER_SCALAR_WEIGHT = 4;
+    static_assert(NUM_BITS_IN_FIELD + FIXED_PER_SCALAR_WEIGHT <= std::numeric_limits<uint16_t>::max(),
+                  "slice-count weight overflows uint16_t");
+    BB_ASSERT_GT(bits_per_slice, 0U);
+
+    const size_t n = nonzero_indices.size();
+    weights.resize(n);
+
+    parallel_for([&](const ThreadChunk& chunk) {
+        for (size_t k : chunk.range(n)) {
+            const auto& scalar = scalars[nonzero_indices[k]];
+            // Scalars were filtered for nonzero and are in non-Montgomery form, so get_msb()
+            // returns a valid bit index in [0, NUM_BITS_IN_FIELD).
+            const uint64_t msb = uint256_t{ scalar.data[0], scalar.data[1], scalar.data[2], scalar.data[3] }.get_msb();
+            const size_t bit_length = static_cast<size_t>(msb) + 1;
+            weights[k] =
+                static_cast<uint16_t>((bit_length + bits_per_slice - 1) / bits_per_slice) + FIXED_PER_SCALAR_WEIGHT;
+        }
+    });
+}
+
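The FIXED_PER_SCALAR_WEIGHT term controls how strongly small scalars are discounted. With an assumed c = 14, a 32-bit scalar costs 3 slices versus 19 for a full-width scalar, so without the fixed term a thread could be handed roughly six small scalars for every full scalar on another thread and would pay six times the per-scalar schedule/sort/reduce overhead; adding 4 tightens the ratio to 23 : 7, about 3.3x. A self-contained sketch of the weight formula on plain integers (the real code derives bit_length from the scalar's uint256 limbs via get_msb(); the constants here are assumptions for illustration):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Assumed values mirroring the comments above: 14-bit slices, fixed overhead of 4.
constexpr std::uint32_t bits_per_slice = 14;
constexpr std::uint16_t fixed_per_scalar_weight = 4;

constexpr std::uint16_t weight_from_bit_length(std::size_t bit_length)
{
    return static_cast<std::uint16_t>((bit_length + bits_per_slice - 1) / bits_per_slice) + fixed_per_scalar_weight;
}

int main()
{
    // bit_length = msb index + 1, e.g. the scalar 1 has msb 0 and bit length 1.
    assert(weight_from_bit_length(1) == 1 + 4);    // a single nonzero slice
    assert(weight_from_bit_length(32) == 3 + 4);   // 32-bit "small" scalar
    assert(weight_from_bit_length(254) == 19 + 4); // full-width BN254 scalar
    // Ratio small : full is 7 : 23 with the fixed term, versus 3 : 19 without it.
    return 0;
}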
+template <typename Curve>
+std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::partition_by_weight(
+    std::span<const std::vector<uint16_t>> msm_scalar_weights, size_t num_threads) noexcept
+{
+    BB_ASSERT_GT(num_threads, 0U);
+    std::vector<ThreadWorkUnits> work_units(num_threads);
+
+    size_t grand_total_weight = 0;
+    for (const auto& weights : msm_scalar_weights) {
+        for (uint16_t w : weights) {
+            grand_total_weight += w;
+        }
+    }
+    if (grand_total_weight == 0) {
+        return work_units;
+    }
+
+    const size_t weight_per_thread = numeric::ceil_div(grand_total_weight, num_threads);
+
+    size_t thread_accumulated_weight = 0;
+    size_t current_thread_idx = 0;
+    for (size_t i = 0; i < msm_scalar_weights.size(); ++i) {
+        const auto& weights = msm_scalar_weights[i];
+        const size_t n = weights.size();
+
+        size_t start = 0;
+        for (size_t k = 0; k < n; ++k) {
+            thread_accumulated_weight += weights[k];
+
+            if (current_thread_idx < num_threads - 1 && thread_accumulated_weight >= weight_per_thread) {
+                work_units[current_thread_idx].push_back(MSMWorkUnit{
+                    .batch_msm_index = i,
+                    .start_index = start,
+                    .size = k + 1 - start,
+                });
+                start = k + 1;
+                current_thread_idx++;
+                thread_accumulated_weight = 0;
+            }
+        }
+        if (start < n) {
+            work_units[current_thread_idx].push_back(MSMWorkUnit{
+                .batch_msm_index = i,
+                .start_index = start,
+                .size = n - start,
+            });
+        }
+    }
+    return work_units;
+}
+
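A worked trace on a toy input: one MSM with eight nonzero scalars of weights { 7, 7, 7, 7, 23, 23, 23, 23 } (four small, four full, as in the Clustered benchmark) split across two threads. The total weight is 120, so the per-thread target is 60; the running total reaches 74 at the sixth scalar, so thread 0 gets the contiguous range [0, 6) (weight 74) and thread 1 gets [6, 8) (weight 46), whereas a count-based split at four elements would yield 28 versus 92. The sketch below is a standalone re-implementation of the walk, not the library function:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Greedy prefix walk for a single MSM: close a chunk whenever the running weight
// reaches the per-thread target, except on the last thread, which absorbs the remainder.
int main()
{
    const std::vector<std::uint16_t> weights = { 7, 7, 7, 7, 23, 23, 23, 23 };
    const std::size_t num_threads = 2;

    std::size_t total = 0;
    for (auto w : weights) {
        total += w;
    }
    const std::size_t target = (total + num_threads - 1) / num_threads; // 60

    std::vector<std::pair<std::size_t, std::size_t>> ranges; // (start_index, size) per thread
    std::size_t start = 0;
    std::size_t acc = 0;
    for (std::size_t k = 0; k < weights.size(); ++k) {
        acc += weights[k];
        if (ranges.size() + 1 < num_threads && acc >= target) {
            ranges.emplace_back(start, k + 1 - start);
            start = k + 1;
            acc = 0;
        }
    }
    ranges.emplace_back(start, weights.size() - start);

    for (std::size_t t = 0; t < ranges.size(); ++t) {
        std::cout << "thread " << t << ": start=" << ranges[t].first << " size=" << ranges[t].second << "\n";
    }
    // Prints: thread 0: start=0 size=6, thread 1: start=6 size=2.
    return 0;
}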
 template <typename Curve>
 std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
     std::span<std::span<ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
 {
     const size_t num_msms = scalars.size();
     msm_scalar_indices.resize(num_msms);
-    for (size_t i = 0; i < num_msms; ++i) {
-        transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
-    }

+    // Weight scalars by their Pippenger cost (slice count + fixed overhead, see
+    // compute_scalar_slice_weights) to improve thread balancing.
+    std::vector<std::vector<uint16_t>> msm_scalar_weights(num_msms);
     size_t total_work = 0;
-    for (const auto& indices : msm_scalar_indices) {
-        total_work += indices.size();
+    for (size_t i = 0; i < num_msms; ++i) {
+        transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
+        const size_t n = msm_scalar_indices[i].size();
+        total_work += n;
+        if (n == 0) {
+            continue;
+        }
+        const uint32_t bps = get_optimal_log_num_buckets(n);
+        compute_scalar_slice_weights(scalars[i], msm_scalar_indices[i], bps, msm_scalar_weights[i]);
     }

     const size_t num_threads = get_num_cpus();
-    std::vector<ThreadWorkUnits> work_units(num_threads);
-
-    const size_t work_per_thread = numeric::ceil_div(total_work, num_threads);
-    const size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));

     // Only use a single work unit if we don't have enough work for every thread
     if (num_threads > total_work) {
+        std::vector<ThreadWorkUnits> work_units(num_threads);
         for (size_t i = 0; i < num_msms; ++i) {
             work_units[0].push_back(MSMWorkUnit{
                 .batch_msm_index = i,
@@ -118,37 +205,7 @@ std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
         return work_units;
     }

-    size_t thread_accumulated_work = 0;
-    size_t current_thread_idx = 0;
-    for (size_t i = 0; i < num_msms; ++i) {
-        size_t msm_work_remaining = msm_scalar_indices[i].size();
-        const size_t initial_msm_work = msm_work_remaining;
-
-        while (msm_work_remaining > 0) {
-            BB_ASSERT_LT(current_thread_idx, work_units.size());
-
-            const size_t total_thread_work =
-                (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
-            const size_t available_thread_work = total_thread_work - thread_accumulated_work;
-            const size_t work_to_assign = std::min(available_thread_work, msm_work_remaining);
-
-            work_units[current_thread_idx].push_back(MSMWorkUnit{
-                .batch_msm_index = i,
-                .start_index = initial_msm_work - msm_work_remaining,
-                .size = work_to_assign,
-            });
-
-            thread_accumulated_work += work_to_assign;
-            msm_work_remaining -= work_to_assign;
-
-            // Move to next thread if current thread is full
-            if (thread_accumulated_work >= total_thread_work) {
-                current_thread_idx++;
-                thread_accumulated_work = 0;
-            }
-        }
-    }
-    return work_units;
+    return partition_by_weight(msm_scalar_weights, num_threads);
 }

 /**
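
One subtlety of the new get_work_units: bits_per_slice is chosen per MSM via get_optimal_log_num_buckets(n), so within one batch the same scalar can receive different weights in different-sized MSMs, and partition_by_weight then balances the pooled weights. The sketch below uses assumed c values (4 for a tiny MSM, 14 for a ~2^20-point MSM) purely for illustration; the actual mapping from MSM size to c is not part of this diff.

#include <cstddef>

// Illustration of per-MSM calibration; the c values are assumptions, not library output.
constexpr std::size_t weight(std::size_t bit_length, std::size_t c) { return (bit_length + c - 1) / c + 4; }

static_assert(weight(32, 4) == 8 + 4);  // tiny MSM: a 32-bit scalar spans 8 four-bit slices
static_assert(weight(32, 14) == 3 + 4); // large MSM: the same scalar spans only 3 slices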

barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp

Lines changed: 22 additions & 1 deletion
@@ -240,6 +240,14 @@ template <typename Curve> class MSM {
     /** @brief Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS) */
     static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept;

+    /** @brief Partition per-MSM scalar weights into num_threads work units of approximately
+     *         equal cumulative weight.
+     * @details Curve-independent and side-effect-free. The walk closes a work unit every time
+     *          the running weight crosses the per-thread target, except on the last thread
+     *          which absorbs any remainder so rounding drift doesn't leave work stranded. */
+    static std::vector<ThreadWorkUnits> partition_by_weight(std::span<const std::vector<uint16_t>> msm_scalar_weights,
+                                                            size_t num_threads) noexcept;
+
     /** @brief Process sorted point schedule into bucket accumulators using batched affine additions */
     static void batch_accumulate_points_into_buckets(std::span<const uint64_t> point_schedule,
                                                      std::span<const AffineElement> points,
@@ -288,7 +296,20 @@ template <typename Curve> class MSM {
     static void transform_scalar_and_get_nonzero_scalar_indices(std::span<ScalarField> scalars,
                                                                 std::vector<uint32_t>& nonzero_scalar_indices) noexcept;

-    /** @brief Distribute multiple MSMs across threads with balanced point counts */
+    /** @brief Compute per-scalar slice-count weights ceil(bit_length / bits_per_slice).
+     * @details Parallel over nonzero_indices. Scalars must be in non-Montgomery form (as left
+     *          by transform_scalar_and_get_nonzero_scalar_indices). Weights drive thread
+     *          partitioning in get_work_units. */
+    static void compute_scalar_slice_weights(std::span<const ScalarField> scalars,
+                                             std::span<const uint32_t> nonzero_indices,
+                                             uint32_t bits_per_slice,
+                                             std::vector<uint16_t>& weights) noexcept;
+
+    /** @brief Distribute multiple MSMs across threads with balanced bucket-accumulation work.
+     * @details Per-thread assignment is a contiguous range of each MSM's nonzero-scalar
+     *          indices, sized by cumulative slice-count weight ceil(bit_length / c). This is
+     *          the actual number of nonzero c-bit slices a scalar contributes — the quantity
+     *          that drives bucket-accumulation cost. */
     static std::vector<ThreadWorkUnits> get_work_units(std::span<std::span<ScalarField>> scalars,
                                                        std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept;
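
Because partition_by_weight carries its running weight across MSM boundaries (the end-of-MSM flush pushes a work unit but does not advance the thread index or reset the accumulator), a thread whose quota straddles a boundary ends up with more than one MSMWorkUnit: the tail of one MSM and the head of the next, each a contiguous range into that MSM's nonzero-index list. A hand-derived illustration, not a call into the library, since get_work_units is internal to MSM:

#include <cstddef>
#include <vector>

// Batch: MSM 0 has four full-width scalars (weight 23 each), MSM 1 has two (weight 23 each).
// Grand total 138, two threads, per-thread target ceil(138 / 2) = 69.
struct WorkUnit {
    std::size_t batch_msm_index;
    std::size_t start_index;
    std::size_t size;
};

// Thread 0 closes after the third scalar of MSM 0 (running weight 69):
const std::vector<WorkUnit> thread0 = { { 0, 0, 3 } };
// Thread 1 picks up the tail of MSM 0 and then all of MSM 1 -- two contiguous ranges,
// one per MSM, since the walk carries its running weight across the MSM boundary:
const std::vector<WorkUnit> thread1 = { { 0, 3, 1 }, { 1, 0, 2 } };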