fix: pippenger edge case (#22256)

iakovenkos · web-flow · commit ed6069d617d2 · 2026-04-02T13:32:54.000+02:00
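Fixes a rare Pippenger edge case: the recursive call in the MSD radix sort passed `keys` instead of `top_level_keys`, so the zero-entry count could be overwritten when the sort recursed 3+ levels deep.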
diff --git a/barretenberg/cpp/src/barretenberg/benchmark/goblin_bench/eccvm.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/goblin_bench/eccvm.bench.cpp
@@ -1,8 +1,11 @@
 #include <benchmark/benchmark.h>
 
+#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
+#include "barretenberg/ecc/curves/bn254/fq.hpp"
 #include "barretenberg/eccvm/eccvm_circuit_builder.hpp"
 #include "barretenberg/eccvm/eccvm_prover.hpp"
 #include "barretenberg/eccvm/eccvm_verifier.hpp"
+#include "barretenberg/srs/global_crs.hpp"
 
 using namespace benchmark;
 using namespace bb;
@@ -40,6 +43,9 @@ Builder generate_trace(size_t target_num_gates)
         op_queue->merge();
     }
 
+    using Fq = curve::BN254::BaseField;
+    op_queue->append_hiding_op(Fq::random_element(), Fq::random_element());
+
     Builder builder{ op_queue };
     return builder;
 }
@@ -63,12 +69,35 @@ void eccvm_prove(State& state) noexcept
     std::shared_ptr<Transcript> prover_transcript = std::make_shared<Transcript>();
     ECCVMProver prover(builder, prover_transcript);
     for (auto _ : state) {
-        auto [proof, ipa_claim] = prover.construct_proof();
+        auto [proof, opening_claim] = prover.construct_proof();
+        auto ipa_transcript = std::make_shared<Transcript>();
+        IPA<Flavor::Curve>::compute_opening_proof(prover.key->commitment_key, opening_claim, ipa_transcript);
+    };
+}
+
+void eccvm_ipa(State& state) noexcept
+{
+    size_t target_num_gates = 1 << static_cast<size_t>(state.range(0));
+    Builder builder = generate_trace(target_num_gates);
+    std::shared_ptr<Transcript> prover_transcript = std::make_shared<Transcript>();
+    ECCVMProver prover(builder, prover_transcript);
+    auto [proof, opening_claim] = prover.construct_proof();
+    for (auto _ : state) {
+        auto ipa_transcript = std::make_shared<Transcript>();
+        IPA<Flavor::Curve>::compute_opening_proof(prover.key->commitment_key, opening_claim, ipa_transcript);
     };
 }
 
 BENCHMARK(eccvm_generate_prover)->Unit(kMillisecond)->DenseRange(12, CONST_ECCVM_LOG_N);
 BENCHMARK(eccvm_prove)->Unit(kMillisecond)->DenseRange(12, CONST_ECCVM_LOG_N);
+BENCHMARK(eccvm_ipa)->Unit(kMillisecond)->DenseRange(12, CONST_ECCVM_LOG_N);
 } // namespace
 
-BENCHMARK_MAIN();
+int main(int argc, char** argv)
+{
+    bb::srs::init_file_crs_factory(bb::srs::bb_crs_path());
+    benchmark::Initialize(&argc, argv);
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+    return 0;
+}
diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.test.cpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.test.cpp
@@ -1,5 +1,6 @@
 
 #include "barretenberg/commitment_schemes/commitment_key.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/srs/global_crs.hpp"
 
 #include <gtest/gtest.h>
@@ -125,6 +126,89 @@ template <typename Curve> class CommitmentKeyTest : public ::testing::Test {
         Commitment expected = commit_naive(ck, poly);
         EXPECT_EQ(expected, commitment);
     }
+
+    // Regression test for a zero-counting bug in Pippenger's MSD radix sort
+    // (sort_point_schedule_and_count_zero_buckets in process_buckets.cpp).
+    //
+    // The bug: the recursive radix sort passed `keys` instead of `top_level_keys` when recursing,
+    // causing the zero-entry counter to be overwritten by non-zero-bucket counts when the sort
+    // uses 3+ recursion levels. The inflated count makes the MSM skip valid point contributions.
+    //
+    // When does 3-level recursion occur?
+    //   - Pippenger chooses bits_per_slice via a cost model (get_optimal_log_num_buckets).
+    //   - bits_per_slice > 16 pads to 24 bits -> initial_shift=16 -> 3 levels (shift 16->8->0).
+    //   - For BN254 (254-bit scalars), bits_per_slice=17 at ~4.6M+ points per work unit.
+    //   - Multi-threading splits MSM across cores, so each work unit is total_points/num_threads.
+    //     On a 32-core machine, a single work unit reaches 4.6M at ~150M total points.
+    //   - Single-threaded execution (WASM, resource-constrained environments) hits the threshold
+    //     at 4.6M points directly.
+    //
+    // Polynomial design (deterministic, all coefficients non-zero):
+    //   get_scalar_slice extracts bits MSB-first. With bits_per_slice=17 and 15 rounds for BN254,
+    //   round 13 extracts bits [16:33) of each scalar. We choose scalar values so that round 13
+    //   has the bucket distribution needed to trigger the overwrite:
+    //
+    //     100 coefficients = Fr(1)      -> bits [16:33) = 0 -> bucket_index = 0
+    //     10  coefficients = Fr(2^16)   -> bits [16:33) = 1 -> bucket_index = 1    [DROPPED]
+    //     ~5M coefficients = Fr(2^32)   -> bits [16:33) = 2^16 -> bucket_index = 65536 [OVERWRITES]
+    //
+    //   Fr(1) entries must be non-zero (zero scalars are filtered before the MSM) but still
+    //   land in bucket 0 for round 13. They ensure point_schedule[0] has bucket_index=0 after
+    //   sorting, bypassing the post-sort safety check in sort_point_schedule_and_count_zero_buckets.
+    //
+    //   The bug overwrites num_zero_entries from 100 (correct) to ~5M (count at bucket 65536).
+    //   The MSM span then starts ~5M entries into the sorted schedule, skipping all 10 target
+    //   entries with bucket_index=1 and silently dropping their contributions.
+    //
+    //   This layout is chosen for efficiency (~1.5s) and full determinism (no random scalars).
+    //   The reference commitment is computed by chunking into 1M-point sub-MSMs, each using
+    //   bits_per_slice <= 15 (2-level sort, bug-free).
+    void test_pippenger_zero_count_regression()
+    {
+        constexpr size_t n = 5000000;
+        CK ck(n);
+
+        Polynomial poly(n);
+
+        constexpr size_t num_fake_zeros = 100;
+        for (size_t i = 0; i < num_fake_zeros; ++i) {
+            poly.at(i) = Fr(1);
+        }
+
+        constexpr size_t num_targets = 10;
+        for (size_t i = num_fake_zeros; i < num_fake_zeros + num_targets; ++i) {
+            poly.at(i) = Fr(65536);
+        }
+
+        for (size_t i = num_fake_zeros + num_targets; i < n; ++i) {
+            poly.at(i) = Fr(uint256_t(1) << 32);
+        }
+
+        // Commit single-threaded to keep the full point set in one work unit
+        size_t original_concurrency = get_num_cpus();
+        set_parallel_for_concurrency(1);
+        Commitment actual_commitment = ck.commit(poly);
+        set_parallel_for_concurrency(original_concurrency);
+
+        // Reference: sum of chunked sub-MSMs (each chunk uses bits_per_slice <= 15, bug-free)
+        constexpr size_t chunk_size = 1UL << 20;
+        auto srs_points = ck.get_monomial_points();
+        GroupElement correct_sum;
+        correct_sum.self_set_infinity();
+
+        for (size_t offset = 0; offset < n; offset += chunk_size) {
+            size_t this_chunk = std::min(chunk_size, n - offset);
+            std::span<const Fr> chunk_coeffs(&poly[offset], this_chunk);
+            PolynomialSpan<const Fr> chunk_span(0, chunk_coeffs);
+            std::span<const Commitment> chunk_points = srs_points.subspan(offset, this_chunk);
+
+            auto chunk_result = scalar_multiplication::pippenger_unsafe<Curve>(chunk_span, chunk_points);
+            correct_sum += chunk_result;
+        }
+        Commitment correct_commitment(correct_sum);
+
+        EXPECT_EQ(actual_commitment, correct_commitment);
+    }
 };
 
 using Curves = ::testing::Types<curve::BN254, curve::Grumpkin>;
@@ -154,5 +238,16 @@ TYPED_TEST(CommitmentKeyTest, CommitWithStartIndex)
 {
     TestFixture::test_commit_with_start_index();
 }
+TYPED_TEST(CommitmentKeyTest, DISABLED_PippengerZeroCountRegression)
+{
+    if constexpr (!std::is_same_v<TypeParam, curve::BN254>) {
+        GTEST_SKIP() << "BN254 only: Grumpkin CRS has insufficient points for the 5M threshold";
+    }
+#ifndef NDEBUG
+    GTEST_SKIP() << "Too slow in debug builds";
+#else
+    TestFixture::test_pippenger_zero_count_regression();
+#endif
+}
 
 } // namespace bb
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
@@ -69,8 +69,12 @@ void radix_sort_count_zero_entries(uint64_t* keys,
         for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) {
             const size_t bucket_size = offsets_copy[i + 1] - offsets_copy[i];
             if (bucket_size > 1) {
-                radix_sort_count_zero_entries(
-                    &keys[offsets_copy[i]], bucket_size, shift - RADIX_BITS, num_zero_entries, bucket_index_bits, keys);
+                radix_sort_count_zero_entries(&keys[offsets_copy[i]],
+                                              bucket_size,
+                                              shift - RADIX_BITS,
+                                              num_zero_entries,
+                                              bucket_index_bits,
+                                              top_level_keys);
             }
         }
     }
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
@@ -224,6 +224,67 @@ template <class Curve> class ScalarMultiplicationTest : public ::testing::Test {
         }
     }
 
+    // Regression test: radix sort zero-counting bug for bucket_index_bits > 16 (3+ recursion levels).
+    // Before the fix, the recursive call passed `keys` instead of `top_level_keys`, causing
+    // num_zero_entries to be overwritten by non-zero-bucket counts once the MSD radix sort recursed 3+ levels deep.
+    void test_radix_sort_count_zero_entries_wide_buckets()
+    {
+        // Use bucket_index_bits = 17, which pads to 24 bits → 3 recursion levels (shift: 16→8→0).
+        // At the 3rd level, passing the wrong `top_level_keys` pointer made zero-counting fire for
+        // every level-0 bucket's sub-bucket-0, not just the bucket-0 chain.
+        constexpr uint32_t bucket_index_bits = 17;
+        constexpr size_t num_entries = 1000;
+
+        std::vector<uint64_t> schedule(num_entries);
+
+        // Place some entries with bucket_index = 0 (true zero-bucket entries)
+        const size_t num_true_zeros = 10;
+        for (size_t i = 0; i < num_true_zeros; ++i) {
+            schedule[i] = static_cast<uint64_t>(i) << 32; // point_index=i, bucket_index=0
+        }
+
+        // Place entries with bucket_index = 65536 (= 1 << 16). Their bits [0:16) are all zero, so the
+        // buggy code counted them as zero-bucket entries: the final recursion level, reached via
+        // level-0 bucket 1, overwrote num_zero_entries with their count.
+        const size_t num_false_zeros = 20;
+        for (size_t i = 0; i < num_false_zeros; ++i) {
+            size_t idx = num_true_zeros + i;
+            schedule[idx] = (static_cast<uint64_t>(idx) << 32) | 65536ULL;
+        }
+
+        // Fill the rest with random bucket indices in [1, 2^17) whose low 16 bits are never all zero
+        for (size_t i = num_true_zeros + num_false_zeros; i < num_entries; ++i) {
+            uint32_t bucket = (engine.get_random_uint32() % ((1U << bucket_index_bits) - 1)) + 1;
+            // Avoid bucket_index values with all lower 16 bits zero (i.e., multiples of 65536)
+            if ((bucket & 0xFFFF) == 0) {
+                bucket |= 1;
+            }
+            schedule[i] = (static_cast<uint64_t>(i) << 32) | static_cast<uint64_t>(bucket);
+        }
+
+        size_t result = scalar_multiplication::sort_point_schedule_and_count_zero_buckets(
+            schedule.data(), num_entries, bucket_index_bits);
+
+        // Count actual zero-bucket entries after sort
+        size_t expected = 0;
+        for (size_t i = 0; i < num_entries; ++i) {
+            if ((schedule[i] & scalar_multiplication::BUCKET_INDEX_MASK) == 0) {
+                expected++;
+            }
+        }
+
+        EXPECT_EQ(result, expected) << "Zero-bucket count is wrong for bucket_index_bits=" << bucket_index_bits
+                                    << ". Got " << result << ", expected " << expected
+                                    << " (likely overwritten by count from a non-zero bucket)";
+
+        // Also verify the array is sorted
+        for (size_t i = 1; i < num_entries; ++i) {
+            uint32_t prev = static_cast<uint32_t>(schedule[i - 1]);
+            uint32_t curr = static_cast<uint32_t>(schedule[i]);
+            EXPECT_LE(prev, curr) << "Array not sorted at index " << i;
+        }
+    }
+
     void test_pippenger_low_memory()
     {
         std::span<ScalarField> test_scalars(&scalars[0], num_points);
@@ -571,6 +632,10 @@ TYPED_TEST(ScalarMultiplicationTest, RadixSortCountZeroEntries)
 {
     this->test_radix_sort_count_zero_entries();
 }
+TYPED_TEST(ScalarMultiplicationTest, RadixSortCountZeroEntriesWideBuckets)
+{
+    this->test_radix_sort_count_zero_entries_wide_buckets();
+}
 TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory)
 {
     this->test_pippenger_low_memory();

Original file line number	Diff line number	Diff line change
`@@ -69,8 +69,12 @@ void radix_sort_count_zero_entries(uint64_t* keys,`
`69`	`69`	`for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) {`
`70`	`70`	`const size_t bucket_size = offsets_copy[i + 1] - offsets_copy[i];`
`71`	`71`	`if (bucket_size > 1) {`
`72`		`- radix_sort_count_zero_entries(`
`73`		`- &keys[offsets_copy[i]], bucket_size, shift - RADIX_BITS, num_zero_entries, bucket_index_bits, keys);`
	`72`	`+ radix_sort_count_zero_entries(&keys[offsets_copy[i]],`
	`73`	`+ bucket_size,`
	`74`	`+ shift - RADIX_BITS,`
	`75`	`+ num_zero_entries,`
	`76`	`+ bucket_index_bits,`
	`77`	`+ top_level_keys);`
`74`	`78`	`}`
`75`	`79`	`}`
`76`	`80`	`}`