AztecProtocol
diff --git a/‎barretenberg/cpp/src/barretenberg/ecc/groups/booth_recode.hpp‎
Lines changed: 106 additions & 0 deletions b/‎barretenberg/cpp/src/barretenberg/ecc/groups/booth_recode.hpp‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp‎
Lines changed: 160 additions & 0 deletions b/‎barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp‎
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,106 @@
+// Shared carry-less signed-Booth window slice parameters.
+//
+// Each window is a c-bit signed digit in [-2^(c-1), 2^(c-1)], read as a (c+1)-bit
+// slice that overlaps its lower neighbour by one bit; the shared boundary bit
+// substitutes for an explicit carry. This is the algorithm Constantine calls
+// `signedWindowEncoding` / `getSignedFullWindowAt`
+// (constantine/math/arithmetic/bigints.nim).
+//
+// The struct + `compute_booth_slice_params` live here so they can be shared
+// between:
+//   * `ecc/groups/element_impl.hpp` — the GLV-endo straus path uses a small
+//     fixed-window (c=4, 32 windows) Booth recoding;
+//   * `ecc/scalar_multiplication/pippenger_constantine.hpp` — the round-parallel
+//     Pippenger MSM uses the same recoding at runtime-chosen window sizes.
+// The two callers diverge on the packed-digit reader (perf-tuned multi-path +
+// SIMD x4 in MSM; simple branchless in element_impl) — only the slice-param
+// computation is shared.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace bb::ecc::booth {
+
+namespace detail {
+
+/**
+ * @brief Mask selecting the low `num_bits` bits of a uint32.
+ *
+ * Well defined for every `num_bits`: widths of 32 or more saturate to an all-ones mask.
+ * The naive `(uint32_t{ 1 } << num_bits) - 1` is undefined behaviour at `num_bits == 32`
+ * (and a hard error during constant evaluation), which is exactly the width a 31-bit
+ * window plus its boundary bit needs.
+ */
+[[nodiscard]] constexpr uint32_t low_bits_mask_u32(size_t num_bits) noexcept
+{
+    return num_bits >= 32 ? UINT32_MAX : ((uint32_t{ 1 } << num_bits) - 1);
+}
+
+} // namespace detail
+
+/**
+ * @brief Per-window precomputed slice parameters for the carry-less signed-Booth
+ *        window recoding. Read out by the per-(point, window) hot loop as two i32
+ *        loads + a fixed bit-twiddle (no branches, no per-iter address arithmetic).
+ *
+ * The (c+1)-bit slice is described as a "lo" part taken from limb `lo_limb` and a
+ * "hi" part taken from limb `hi_limb`; `compute_booth_slice_params` fills the fields.
+ *
+ * `slice_localised_to_one_u64`: true iff every bit of the (c+1)-bit window lives
+ * inside a single uint64 limb. Callers that have a single-load fast path branch on
+ * this; callers that don't can ignore it (the field is one bool — zero cost).
+ */
+struct BoothSliceParams {
+    uint32_t lo_mask; // mask for the lo part (low `lo_bits` bits); 0 for the bottom window
+    uint32_t hi_mask; // mask for the hi part; 0 when there is no hi part or it was clamped away
+    uint32_t lo_limb; // index of the uint64 limb holding the slice's lowest bit
+    uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window
+    uint32_t lo_off;  // bit position of the slice's lowest bit inside `lo_limb`
+    uint32_t lo_bits; // number of slice bits supplied by `lo_limb`
+    bool slice_localised_to_one_u64;
+};
+
+/**
+ * @brief Compute the Booth slice params for a window starting at absolute bit
+ *        position `bit_offset`. The slice is `[bit_offset - 1, bit_offset + window_bits)`;
+ *        the bit at `bit_offset - 1` is the shared boundary bit. The bottom window
+ *        (`bit_offset == 0`) is encoded specially so the same recoding algebra
+ *        applies — read "limb -1" as a zero-masked load.
+ *
+ *        `constexpr` so callers with compile-time window schedules
+ *        (`element_impl`'s GLV-endo 32-window table) can materialise the param
+ *        array at compile time, while runtime-schedule callers (Pippenger) use
+ *        the same function at runtime.
+ *
+ * @param bit_offset        Absolute bit position of the window's least-significant bit.
+ * @param window_bits       Window width c; the slice read is c + 1 bits wide. The masks are
+ *                          32-bit, so `window_bits + 1` must not exceed 32.
+ * @param num_uint64_limbs  Number of uint64 limbs backing the scalar. Only the hi-limb index is
+ *                          clamped against it; `lo_limb` is not range-checked here.
+ * @return The slice parameters for this window.
+ */
+[[nodiscard]] constexpr BoothSliceParams compute_booth_slice_params(size_t bit_offset,
+                                                                    size_t window_bits,
+                                                                    size_t num_uint64_limbs) noexcept
+{
+    constexpr size_t LIMB_BITS = 64;
+    BoothSliceParams sp{};
+    if (bit_offset == 0) {
+        // Bottom window: the boundary bit below the LSB is a synthetic 0. Encode this by
+        // reading "limb -1" as a zero-masked load (lo_mask = 0), then reading window_bits
+        // bits from limb 0 into the hi side and shifting them left by 1. This puts the
+        // window_bits-bit window at bits 1..window_bits with bit 0 = 0, matching the inner-
+        // loop body used by every other window. Not localised — the synthetic-lookback
+        // assembly only works in the slow path.
+        sp.lo_limb = 0; // safe in-range, but masked to 0
+        sp.hi_limb = 0; // = scalar limb 0
+        sp.lo_off = LIMB_BITS - 1;
+        sp.lo_bits = 1; // shifts hi_part left by 1, planting the window_bits-bit window at bits 1..window_bits
+        sp.lo_mask = 0; // lo_part contributes nothing
+        sp.hi_mask = detail::low_bits_mask_u32(window_bits);
+        sp.slice_localised_to_one_u64 = false;
+    } else {
+        const size_t lookback_bit = bit_offset - 1;
+        const size_t bits_to_read = window_bits + 1;
+        sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS);
+        sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS - 1));
+        // The lo part takes as many slice bits as remain in `lo_limb` above `lo_off`.
+        const size_t bits_left_in_limb = LIMB_BITS - sp.lo_off;
+        sp.lo_bits = static_cast<uint32_t>(bits_left_in_limb < bits_to_read ? bits_left_in_limb : bits_to_read);
+        const uint32_t hi_bits = static_cast<uint32_t>(bits_to_read) - sp.lo_bits;
+        // window_bits+1 ≤ 32 for our windows ⇒ lo_bits ≤ 32 ⇒ mask fits in uint32. The helper
+        // keeps the lo_bits == 32 edge well defined (a plain 32-bit shift by 32 is not).
+        sp.lo_mask = detail::low_bits_mask_u32(sp.lo_bits);
+        // If the natural hi-limb read would land past the end of the scalar's storage,
+        // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The
+        // top window's hi_bits worth of bits are conceptually zero (the scalar is below
+        // 2^num_bits, and num_bits ≤ num_windows·window_bits). Re-reading lo_limb under a
+        // zero mask keeps the slow path's two unconditional limb loads branch-free.
+        if (static_cast<size_t>(sp.lo_limb) + 1 >= num_uint64_limbs) {
+            sp.hi_limb = sp.lo_limb;
+            sp.hi_mask = 0;
+        } else {
+            sp.hi_limb = sp.lo_limb + 1;
+            sp.hi_mask = detail::low_bits_mask_u32(hi_bits);
+        }
+        // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`, i.e. the
+        // slice does not straddle a 64-bit boundary (hi_bits == 0). A clamped top window
+        // that would have straddled keeps hi_bits != 0 and therefore stays on the two-load
+        // path, where the zero hi_mask discards the out-of-range bits.
+        sp.slice_localised_to_one_u64 = (hi_bits == 0);
+    }
+    return sp;
+}
+
+} // namespace bb::ecc::booth
@@ -0,0 +1,160 @@
+// libFuzzer target for the Constantine signed-Booth window recoder.
+//
+// Two-pronged differential check on each input:
+//   1. Scalar path vs textbook reference oracle — catches encoder algebra bugs.
+//   2. SIMD x4 path vs scalar path (lane-by-lane) — catches lane-mux / mask /
+//      vector-shift bugs in the three slice-path specialisations.
+//
+// Input layout: 1 byte selecting window_bits ∈ [1, 19] (1 + byte % 19), 1 byte
+// bit_offset ∈ [0, 255], followed by 32 bytes × 4 = 128 bytes of scalar limb
+// material. Total minimum input = 130 bytes; smaller inputs are zero-padded so
+// libFuzzer's empty-seed kickoff still drives the encoder.
+//
+// Run:
+//   cmake --preset fuzzing && cmake --build --preset fuzzing --target ecc_pippenger_constantine_fuzzer
+//   ./build-fuzzing/bin/ecc_pippenger_constantine_fuzzer -max_total_time=60
+
+#include "pippenger_constantine.hpp"
+
+#include "barretenberg/numeric/uint256/uint256.hpp"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+
+namespace {
+
+namespace cnst = bb::scalar_multiplication::round_parallel_detail;
+
+// Scalar geometry used throughout the harness: 256-bit scalars, viewed either as
+// 64-bit limbs (scalar path) or 32-bit words (SIMD path).
+constexpr size_t MAX_BITS = 256;
+constexpr size_t LIMB_BITS_U64 = 64;
+constexpr size_t NUM_LIMBS_U64 = MAX_BITS / LIMB_BITS_U64;
+constexpr size_t NUM_LIMBS_U32 = MAX_BITS / 32;
+constexpr size_t SCALAR_BYTES = MAX_BITS / 8;
+
+/**
+ * @brief Bit-at-a-time reference for the packed signed-Booth digit of one window.
+ *
+ * Collects the (window_bits + 1)-bit slice covering scalar bits
+ * [bit_offset - 1, bit_offset + window_bits); bits below position 0 or at/above
+ * MAX_BITS read as zero. The slice is then folded into a sign flag (bit 31 of the
+ * result) and a `window_bits`-wide bucket value (low bits of the result).
+ *
+ * @param scalar_data  Pointer to the scalar's 64-bit limbs, least-significant limb first.
+ * @param bit_offset   Absolute position of the window's lowest bit.
+ * @param window_bits  Window width in bits.
+ * @return (sign << 31) | bucket.
+ */
+uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
+{
+    uint32_t slice = 0;
+    for (size_t k = 0; k <= window_bits; ++k) {
+        // Slice bit k is scalar bit (bit_offset + k - 1); work with the +1-biased index so
+        // the synthetic bit below position 0 needs no signed arithmetic.
+        const size_t pos_plus_one = bit_offset + k;
+        if (pos_plus_one == 0 || pos_plus_one > MAX_BITS) {
+            continue; // outside the scalar: contributes a zero bit
+        }
+        const size_t pos = pos_plus_one - 1;
+        const uint64_t limb = scalar_data[pos / LIMB_BITS_U64];
+        const auto bit = static_cast<uint32_t>((limb >> (pos % LIMB_BITS_U64)) & uint64_t{ 1 });
+        slice |= bit << k;
+    }
+
+    // The slice's top bit is the sign; the remaining bits round up to the magnitude.
+    const uint32_t sign = (slice >> window_bits) & 1U;
+    const uint32_t bucket_mask = (uint32_t{ 1 } << window_bits) - 1;
+    const uint32_t rounded = (slice + 1) >> 1;
+    uint32_t bucket = rounded;
+    if (sign != 0) {
+        // Negative digit: bitwise complement of (rounded - 1), truncated to the window below.
+        bucket = ~(rounded - 1);
+    }
+    return (sign << 31) | (bucket & bucket_mask);
+}
+
+/**
+ * @brief Production scalar path for one window of one scalar.
+ *
+ * Derives the slice parameters for the 64-bit limb layout, then hands them,
+ * field by field, to the production packed-digit reader.
+ *
+ * @param scalar_data  Pointer to the scalar's NUM_LIMBS_U64 64-bit limbs.
+ * @param bit_offset   Absolute position of the window's lowest bit.
+ * @param window_bits  Window width in bits.
+ * @return The packed digit produced by `get_constantine_packed_digit`.
+ */
+uint32_t production_scalar(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
+{
+    const auto params = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64);
+    const uint32_t digit = cnst::get_constantine_packed_digit(scalar_data,
+                                                              params.lo_limb,
+                                                              params.hi_limb,
+                                                              params.lo_off,
+                                                              params.lo_bits,
+                                                              params.lo_mask,
+                                                              params.hi_mask,
+                                                              params.slice_localised_to_one_u64,
+                                                              window_bits);
+    return digit;
+}
+
+/**
+ * @brief Production SIMD x4 path: extract the packed digit of one window for four
+ *        scalars at once and write the four results to `out`.
+ *
+ * Slice parameters are computed for the 32-bit word layout (NUM_LIMBS_U32 words per
+ * scalar), and each scalar's 64-bit limb array is reinterpreted as 32-bit words.
+ * The slice is classified once and dispatched to the matching x4 store routine.
+ *
+ * @param scalars      Four scalars, one per SIMD lane.
+ * @param bit_offset   Absolute position of the window's lowest bit.
+ * @param window_bits  Window width in bits.
+ * @param out          Receives one packed digit per lane.
+ */
+void production_simd(const std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4>& scalars,
+                     size_t bit_offset,
+                     size_t window_bits,
+                     std::array<uint32_t, 4>& out)
+{
+    const auto params = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
+    const auto path = cnst::classify_slice_path_u32(params);
+
+    // Lane-broadcast constants shared by all three store routines.
+    const uint32_t digit_mask = (uint32_t{ 1 } << window_bits) - 1;
+    const cnst::SimdU32x4 ones_x4{ 1, 1, 1, 1 };
+    const cnst::SimdU32x4 digit_mask_x4{ digit_mask, digit_mask, digit_mask, digit_mask };
+    const cnst::SimdU32x4 lo_mask_x4{ params.lo_mask, params.lo_mask, params.lo_mask, params.lo_mask };
+    const cnst::SimdU32x4 hi_mask_x4{ params.hi_mask, params.hi_mask, params.hi_mask, params.hi_mask };
+
+    // View each lane's 64-bit limbs as the 32-bit words the x4 routines consume.
+    const auto lane_words = [&scalars](size_t lane) {
+        return reinterpret_cast<const uint32_t*>(scalars[lane].data());
+    };
+    const uint32_t* const lane0 = lane_words(0);
+    const uint32_t* const lane1 = lane_words(1);
+    const uint32_t* const lane2 = lane_words(2);
+    const uint32_t* const lane3 = lane_words(3);
+
+    const auto window_bits_u32 = static_cast<uint32_t>(window_bits);
+    uint32_t* const dst = out.data();
+
+    switch (path) {
+    case cnst::ConstantineSlicePath::Localised:
+        cnst::store_constantine_packed_digits_x4_localised(dst,
+                                                           lane0,
+                                                           lane1,
+                                                           lane2,
+                                                           lane3,
+                                                           params.lo_limb,
+                                                           params.lo_off,
+                                                           lo_mask_x4,
+                                                           ones_x4,
+                                                           digit_mask_x4,
+                                                           window_bits_u32);
+        break;
+    case cnst::ConstantineSlicePath::Bottom:
+        cnst::store_constantine_packed_digits_x4_bottom(dst,
+                                                        lane0,
+                                                        lane1,
+                                                        lane2,
+                                                        lane3,
+                                                        params.hi_limb,
+                                                        params.lo_bits,
+                                                        hi_mask_x4,
+                                                        ones_x4,
+                                                        digit_mask_x4,
+                                                        window_bits_u32);
+        break;
+    case cnst::ConstantineSlicePath::Boundary:
+        cnst::store_constantine_packed_digits_x4_boundary(dst,
+                                                          lane0,
+                                                          lane1,
+                                                          lane2,
+                                                          lane3,
+                                                          params.lo_limb,
+                                                          params.hi_limb,
+                                                          params.lo_off,
+                                                          params.lo_bits,
+                                                          lo_mask_x4,
+                                                          hi_mask_x4,
+                                                          ones_x4,
+                                                          digit_mask_x4,
+                                                          window_bits_u32);
+        break;
+    }
+}
+
+} // namespace
+
+/**
+ * @brief libFuzzer entry point: decode one input into (window_bits, bit_offset, 4 scalars)
+ *        and run both differential checks, trapping on the first mismatch.
+ *
+ * @param data  Fuzzer-provided bytes; may be shorter than the layout requires (or empty).
+ * @param size  Number of valid bytes at `data`.
+ * @return Always 0; a failed check aborts via `__builtin_trap()`.
+ */
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+    // Pad input to the minimum required length so empty / tiny seeds still
+    // exercise the encoder against zero-extended scalars.
+    constexpr size_t MIN_INPUT = 2 + (SCALAR_BYTES * 4);
+    std::array<uint8_t, MIN_INPUT> buf{};
+    const size_t copy_len = size < MIN_INPUT ? size : MIN_INPUT;
+    // memcpy requires valid pointers even for a zero-length copy, and a driver may hand
+    // over a null `data` for an empty input — skip the call in that case.
+    if (copy_len != 0 && data != nullptr) {
+        std::memcpy(buf.data(), data, copy_len);
+    }
+
+    // window_bits ∈ [1, 19] — `choose_window_bits` returns [2,19]; the final
+    // window emitted by `build_var_window_schedule` can additionally be 1 bit
+    // (e.g. wb=3 over 256 bits = 85*3+1). Outside this range the encoder has
+    // no well-defined behavior in production.
+    const size_t window_bits = 1 + (buf[0] % 19);
+    // bit_offset ∈ [0, 255] — the live pipeline's range, including the top
+    // edge where bit_offset+wb extends past the scalar's 256 bits (production
+    // code clamps `hi_limb` and zeros `hi_mask`).
+    const size_t bit_offset = buf[1];
+
+    std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> scalars{};
+    for (size_t lane = 0; lane < 4; ++lane) {
+        std::memcpy(scalars[lane].data(), buf.data() + 2 + (lane * SCALAR_BYTES), SCALAR_BYTES);
+    }
+
+    // Check 1: scalar path matches the textbook reference oracle. The scalar digits are
+    // kept so Check 2 does not have to recompute them.
+    std::array<uint32_t, 4> scalar_out{};
+    for (size_t lane = 0; lane < 4; ++lane) {
+        scalar_out[lane] = production_scalar(scalars[lane].data(), bit_offset, window_bits);
+        const uint32_t want = reference_packed_digit(scalars[lane].data(), bit_offset, window_bits);
+        if (scalar_out[lane] != want) {
+            __builtin_trap();
+        }
+    }
+
+    // Check 2: SIMD x4 path agrees with scalar path lane-by-lane.
+    std::array<uint32_t, 4> simd_out{};
+    production_simd(scalars, bit_offset, window_bits, simd_out);
+    for (size_t lane = 0; lane < 4; ++lane) {
+        if (simd_out[lane] != scalar_out[lane]) {
+            __builtin_trap();
+        }
+    }
+
+    return 0;
+}