From d76a3163082f0400e0616d9e91ae77b9f76aaa63 Mon Sep 17 00:00:00 2001
From: iakovenkos <sergey.s.yakovenko@gmail.com>
Date: Tue, 26 May 2026 13:48:50 +0000
Subject: [PATCH 1/2] feat: extract Constantine signed-Booth window recoder
 with tests + fuzzer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds pippenger_constantine.hpp as a standalone primitive carved out of the
upcoming round-parallel Pippenger MSM rewrite. Self-contained header in
bb::scalar_multiplication::round_parallel_detail with no callers on this
branch — landed first so the recoder gets reviewed and CI-gated as a
mathematical primitive rather than buried inside the larger MSM PR.

The recoder implements Constantine's signedWindowEncoding / getSignedFullWindowAt
(constantine/math/arithmetic/bigints.nim): each window reads (c+1) bits including
the previous window's top bit and lets that shared boundary bit substitute for an
explicit carry, producing a (sign | bucket) packed digit. Two parallel families:

  * Scalar path  — ConstantineSliceParams + get_constantine_packed_digit
                   (uint64-indexed limbs)
  * SIMD x4 path — ConstantineSliceParamsU32 + three specialised stores
                   (uint32-indexed limbs, GCC vector_size, 4 scalars/call)

The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the
per-window branch hoists out of the per-scalar inner loop.

Tests (pippenger_constantine.test.cpp, ~48 ms native / ~46 ms WASM):

  1. ScalarMatchesReferenceOracleAllWindowBits — production scalar path vs a
     textbook signed-window reference oracle. Sweeps window_bits in [1, 19]
     (covers the [2, 19] choose_window_bits range plus the wb=1 final window
     that build_var_window_schedule can emit) and bit_offset in [0, 255].
  2. SimdX4MatchesScalarPathLanewise — SIMD x4 ≡ scalar lane-by-lane across
     all three specialisations, with a coverage assertion that each fires.
  3. RoundTripIdentityMatchesScalarMod2N — sum_w (-1)^{sign_w} · bucket_w ·
     2^{B_w} ≡ scalar (mod 2^256). The load-bearing algebraic invariant.
  4. EdgeCases — zero scalar, bottom-window classifier, top-window limb
     clamp (incl. bit_offset 255), localised/boundary flag boundaries.
  5. NamedSliceShapes — 12 named structural cases (bottom_wb*, local_*,
     boundary_u64_at_{63,127,191}, boundary_u32_at_31, top_clamped,
     top_wb1_final) so a regression at one shape shows up as a named failure.
  6. ParamClassifierU64U32Consistency — asserts the u64 / u32 param structs
     encode the same lookback bit position and bottom-window classification,
     so a bug in one classifier alone is not masked by the final-digit oracle.

Fuzzer (pippenger_constantine.fuzzer.cpp):

  Differential libFuzzer target — for each (window_bits, bit_offset, 4 random
  scalars) checks (a) scalar path == reference oracle, (b) SIMD x4 == scalar
  per lane. Verified clean under both `fuzzing` (30M iters / 30s) and
  `fuzzing-asan` (30M iters / 46s) presets with a boundary-biased seed corpus
  pinned to {0, 1, 31/32/33, 63/64/65, 127/128, 191/192, 253/254/255}.

Minor follow-on cleanup: extracted simd_u32x4_store() helper to dedupe the
three `#ifdef __wasm_simd128__` blocks across the SIMD specialisations.
[[gnu::always_inline]] makes it post-inline byte-identical to the previous
open-coded form.
---
 .../pippenger_constantine.fuzzer.cpp          | 160 +++++++
 .../pippenger_constantine.hpp                 | 389 +++++++++++++++
 .../pippenger_constantine.test.cpp            | 443 ++++++++++++++++++
 3 files changed, 992 insertions(+)
 create mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
 create mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
 create mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
new file mode 100644
index 000000000000..451a9baa31b9
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
@@ -0,0 +1,160 @@
+// libFuzzer target for the Constantine signed-Booth window recoder.
+//
+// Two-pronged differential check on each input:
+//   1. Scalar path vs textbook reference oracle — catches encoder algebra bugs.
+//   2. SIMD x4 path vs scalar path (lane-by-lane) — catches lane-mux / mask /
+//      vector-shift bugs in the three slice-path specialisations.
+//
+// Input layout: 1 byte window_bits ∈ [1, 19], 1 byte bit_offset ∈ [0, 255],
+// followed by 32 bytes × 4 = 128 bytes of scalar limb material. Total minimum
+// input = 130 bytes; smaller inputs are zero-padded so libFuzzer's empty-seed
+// kickoff still drives the encoder.
+//
+// Run:
+//   cmake --preset fuzzing && cmake --build --preset fuzzing --target ecc_pippenger_constantine_fuzzer
+//   ./build-fuzzing/bin/ecc_pippenger_constantine_fuzzer -max_total_time=60
+
+#include "pippenger_constantine.hpp"
+
+#include "barretenberg/numeric/uint256/uint256.hpp"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+
+namespace {
+
+namespace cnst = bb::scalar_multiplication::round_parallel_detail;
+
+constexpr size_t LIMB_BITS_U64 = 64;
+constexpr size_t NUM_LIMBS_U64 = 4;
+constexpr size_t NUM_LIMBS_U32 = 8;
+constexpr size_t MAX_BITS = 256;
+constexpr size_t SCALAR_BYTES = 32;
+
+uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
+{
+    const auto read_bit = [scalar_data](int64_t pos) -> uint32_t {
+        if (pos < 0 || static_cast<size_t>(pos) >= MAX_BITS) {
+            return 0;
+        }
+        const auto idx = static_cast<size_t>(pos);
+        return static_cast<uint32_t>((scalar_data[idx / LIMB_BITS_U64] >> (idx % LIMB_BITS_U64)) & uint64_t{ 1 });
+    };
+    // Gather bits [bit_offset - 1, bit_offset + window_bits) LSB-first; out-of-range bits read as 0.
+    const int64_t lookback = static_cast<int64_t>(bit_offset) - 1;
+    uint32_t window = 0;
+    for (size_t shift = 0; shift <= window_bits; ++shift) {
+        window |= read_bit(lookback + static_cast<int64_t>(shift)) << shift;
+    }
+    const uint32_t sign = (window >> window_bits) & 1U;
+    const uint32_t half = (window + 1) >> 1;
+    const uint32_t magnitude = ((half - sign) ^ (uint32_t{ 0 } - sign)) & ((uint32_t{ 1 } << window_bits) - 1);
+    return (sign << 31) | magnitude;
+}
+
+uint32_t production_scalar(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
+{
+    const auto params = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64);
+    return cnst::get_constantine_packed_digit(scalar_data,
+                                              params.lo_limb,
+                                              params.hi_limb,
+                                              params.lo_off,
+                                              params.lo_bits,
+                                              params.lo_mask,
+                                              params.hi_mask,
+                                              params.slice_localised_to_one_u64,
+                                              window_bits);
+}
+
+void production_simd(const std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4>& scalars,
+                     size_t bit_offset,
+                     size_t window_bits,
+                     std::array<uint32_t, 4>& out)
+{
+    const auto splat = [](uint32_t x) { return cnst::SimdU32x4{ x, x, x, x }; };
+    const auto params = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
+    const cnst::SimdU32x4 lo_mask_v = splat(params.lo_mask);
+    const cnst::SimdU32x4 hi_mask_v = splat(params.hi_mask);
+    const cnst::SimdU32x4 one_v = splat(1);
+    const cnst::SimdU32x4 val_mask = splat((uint32_t{ 1 } << window_bits) - 1);
+    const auto wb = static_cast<uint32_t>(window_bits);
+    const auto* lane0 = reinterpret_cast<const uint32_t*>(scalars[0].data());
+    const auto* lane1 = reinterpret_cast<const uint32_t*>(scalars[1].data());
+    const auto* lane2 = reinterpret_cast<const uint32_t*>(scalars[2].data());
+    const auto* lane3 = reinterpret_cast<const uint32_t*>(scalars[3].data());
+
+    switch (cnst::classify_slice_path_u32(params)) {
+    case cnst::ConstantineSlicePath::Localised:
+        cnst::store_constantine_packed_digits_x4_localised(
+            out.data(), lane0, lane1, lane2, lane3, params.lo_limb, params.lo_off, lo_mask_v, one_v, val_mask, wb);
+        break;
+    case cnst::ConstantineSlicePath::Bottom:
+        cnst::store_constantine_packed_digits_x4_bottom(
+            out.data(), lane0, lane1, lane2, lane3, params.hi_limb, params.lo_bits, hi_mask_v, one_v, val_mask, wb);
+        break;
+    case cnst::ConstantineSlicePath::Boundary:
+        cnst::store_constantine_packed_digits_x4_boundary(out.data(),
+                                                          lane0,
+                                                          lane1,
+                                                          lane2,
+                                                          lane3,
+                                                          params.lo_limb,
+                                                          params.hi_limb,
+                                                          params.lo_off,
+                                                          params.lo_bits,
+                                                          lo_mask_v,
+                                                          hi_mask_v,
+                                                          one_v,
+                                                          val_mask,
+                                                          wb);
+        break;
+    }
+}
+
+} // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+    // Zero-extend short inputs to the fixed layout (2 header bytes + 4 scalars) so
+    // that empty / tiny seeds still drive the encoder; surplus bytes are ignored.
+    constexpr size_t MIN_INPUT = 2 + (SCALAR_BYTES * 4);
+    std::array<uint8_t, MIN_INPUT> padded{};
+    std::memcpy(padded.data(), data, std::min(size, MIN_INPUT));
+
+    // window_bits ∈ [1, 19] — `choose_window_bits` returns [2,19]; the final
+    // window emitted by `build_var_window_schedule` can additionally be 1 bit
+    // (e.g. wb=3 over 256 bits = 85*3+1). Outside this range the encoder has
+    // no well-defined behavior in production.
+    const size_t window_bits = 1 + (padded[0] % 19);
+    // bit_offset ∈ [0, 255] — the live pipeline's range, including the top
+    // edge where bit_offset+wb extends past the scalar's 256 bits (production
+    // code clamps `hi_limb` and zeros `hi_mask`).
+    const size_t bit_offset = padded[1];
+
+    std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> scalars{};
+    for (size_t i = 0; i < scalars.size(); ++i) {
+        std::memcpy(scalars[i].data(), padded.data() + 2 + (i * SCALAR_BYTES), SCALAR_BYTES);
+    }
+
+    // Check 1: the production scalar path must equal the textbook reference oracle.
+    for (const auto& scalar : scalars) {
+        const uint32_t expected = reference_packed_digit(scalar.data(), bit_offset, window_bits);
+        const uint32_t actual = production_scalar(scalar.data(), bit_offset, window_bits);
+        if (actual != expected) {
+            __builtin_trap();
+        }
+    }
+
+    // Check 2: every lane of the SIMD x4 path must equal the scalar path.
+    std::array<uint32_t, 4> simd_out{};
+    production_simd(scalars, bit_offset, window_bits, simd_out);
+    for (size_t i = 0; i < simd_out.size(); ++i) {
+        const uint32_t expected = production_scalar(scalars[i].data(), bit_offset, window_bits);
+        if (simd_out[i] != expected) {
+            __builtin_trap();
+        }
+    }
+
+    return 0;
+}
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
new file mode 100644
index 000000000000..ec2c3e6800b2
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
@@ -0,0 +1,389 @@
+// Constantine-style signed-Booth window recoder for Pippenger MSM.
+//
+// Given a scalar s = sum_i s_i 2^i and a window [b, b + c), this module computes a
+// signed digit d in [-2^(c-1), 2^(c-1)] such that the scalar can be reconstructed as
+// s = sum_w d_w 2^{b_w}. It returns d as a packed `(sign | bucket)` value, where
+// `bucket = |d|` and `sign` records whether d is negative.
+//
+// Implements the carry-less `signedWindowEncoding` / `getSignedFullWindowAt` pattern from
+// `constantine/math/arithmetic/bigints.nim`: each window reads c+1 bits including the
+// previous window boundary bit, lets that shared boundary bit substitute for an explicit
+// carry, and produces a `(sign | bucket)` packed digit.
+//
+// Assumptions: production callers pass `window_bits` in [1, 19] and bit offsets within a
+// 256-bit scalar. The bit-twiddling below assumes `window_bits < 32`.
+//
+// Two parallel paths:
+//   * scalar path  — `ConstantineSliceParams` + `get_constantine_packed_digit` (uint64-
+//     indexed limbs).
+//   * SIMD x4 path — `ConstantineSliceParamsU32` + `store_constantine_packed_digits_x4_*`
+//     (uint32-indexed limbs, processes 4 scalars per call via GCC vector_size).
+//
+// The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window
+// branch is hoisted out of the per-scalar loop. `classify_slice_path_u32` returns the
+// matching enum for callers to dispatch on once per window.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#endif
+
+namespace bb::scalar_multiplication::round_parallel_detail {
+
+/**
+ * @brief Per-window precomputed slice parameters for the carry-less signed-Booth window
+ *        recoding (after Constantine `signedWindowEncoding` / `getSignedFullWindowAt`,
+ *        `constantine/math/arithmetic/bigints.nim`). Computed once per window by the
+ *        caller; the per-scalar hot path is then fixed bit-twiddling with no per-iteration
+ *        slice address arithmetic.
+ *        Carry-less because every non-bottom window's c+1-bit read shares its boundary bit
+ *        with the previous window — the bit a non-overlapping recoder would carry.
+ *
+ * `slice_localised_to_one_u64`: true iff every bit of the c+1-bit window lives inside a
+ * single uint64 limb. Most windows on typical 254-bit scalars with c in [12, 19]
+ * (lookback bits at non-boundary positions) hit this and take the fast path: one load,
+ * one shift, one mask. The slow path is the boundary-straddling case + the synthetic-
+ * lookback bottom window.
+ */
+struct ConstantineSliceParams {
+    uint32_t lo_mask; // mask applied to the bits taken from `lo_limb` (0 on the bottom window)
+    uint32_t hi_mask; // mask applied to `hi_limb` before the left shift (0 when clamped at the top window)
+    uint32_t lo_limb; // u64 limb index holding the lookback bit (0 on the bottom window)
+    uint32_t hi_limb; // == lo_limb + 1, except == lo_limb on the bottom window and when clamped at the top window
+    uint32_t lo_off;  // bit offset of the lookback bit within `lo_limb`
+    uint32_t lo_bits; // # bits read from `lo_limb`; also the left-shift amount of the hi part
+    bool slice_localised_to_one_u64;
+};
+
+/**
+ * @brief Compute the Constantine slice params for a window starting at absolute bit position
+ *        `bit_offset` (= Σ_{k<w} window_bits_k under variable-window, or w·window_bits under
+ *        uniform-window). The slice is `[bit_offset - 1, bit_offset + window_bits)`; the bit at
+ *        bit_offset - 1 is the shared boundary bit. The bottom window (bit_offset == 0) is
+ *        encoded specially so the same recoding algebra applies.
+ */
+[[nodiscard]] inline ConstantineSliceParams compute_constantine_slice_params(size_t bit_offset,
+                                                                             size_t window_bits,
+                                                                             size_t num_uint64_limbs) noexcept
+{
+    constexpr size_t LIMB_BITS = 64;
+    ConstantineSliceParams sp;
+    if (bit_offset == 0) {
+        // Bottom window: the boundary bit below the LSB is a synthetic 0. Encode this by
+        // reading "limb -1" as a zero-masked load (lo_mask = 0), then reading window_bits
+        // bits from limb 0 into the hi side and shifting them left by 1. This puts the
+        // window_bits-bit window at bits 1..window_bits with bit 0 = 0, matching the inner-
+        // loop body used by every other window. Not localised — the synthetic-lookback
+        // assembly only works in the slow path.
+        sp.lo_limb = 0; // safe in-range, but masked to 0
+        sp.hi_limb = 0; // = scalar limb 0
+        sp.lo_off = LIMB_BITS - 1;
+        sp.lo_bits = 1; // shifts hi_part left by 1, planting the window_bits-bit window at bits 1..window_bits
+        sp.lo_mask = 0; // lo_part contributes nothing
+        sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1;
+        sp.slice_localised_to_one_u64 = false;
+    } else {
+        const size_t lookback_bit = bit_offset - 1;
+        const size_t bits_to_read = window_bits + 1;
+        sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS);
+        sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS - 1));
+        sp.lo_bits = static_cast<uint32_t>(LIMB_BITS - sp.lo_off < bits_to_read ? LIMB_BITS - sp.lo_off : bits_to_read);
+        const uint32_t hi_bits = static_cast<uint32_t>(bits_to_read) - sp.lo_bits;
+        // window_bits ≤ 19 for our windows ⇒ lo_bits ≤ 20 < 32 ⇒ shift is well-defined, mask fits in uint32.
+        sp.lo_mask = (uint32_t{ 1 } << sp.lo_bits) - 1;
+        // If the natural hi-limb read would land past the end of the scalar's storage,
+        // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The
+        // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤
+        // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow
+        // path's two unconditional limb loads branch-free.
+        if (static_cast<size_t>(sp.lo_limb) + 1 >= num_uint64_limbs) {
+            sp.hi_limb = sp.lo_limb;
+            sp.hi_mask = 0;
+        } else {
+            sp.hi_limb = sp.lo_limb + 1;
+            sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1;
+        }
+        // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`, i.e. hi_bits == 0 (the
+        // window doesn't straddle a 64-bit boundary). A clamped top window with hi_bits > 0 is NOT
+        // localised: it takes the slow path, where hi_mask == 0 discards the re-read limb.
+        sp.slice_localised_to_one_u64 = (hi_bits == 0);
+    }
+    return sp;
+}
+
+/**
+ * @brief Read (window_bits+1) bits from `scalar_data` (uint64 limbs) using precomputed
+ *        slice params and apply Constantine's signedWindowEncoding to produce a
+ *        `(sign | bucket)` packed digit.
+ *
+ *        Takes the slice params as scalar value parameters rather than a struct reference
+ *        so the compiler can keep them in registers across the caller loop.
+ *
+ *        `slice_localised_to_one_u64` selects the single-load fast path: ~75% of windows
+ *        on typical 254-bit scalars (window_bits in [12, 19]) hit this.
+ */
+[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data,
+                                                                                  uint32_t lo_limb,
+                                                                                  uint32_t hi_limb,
+                                                                                  uint32_t lo_off,
+                                                                                  uint32_t lo_bits,
+                                                                                  uint32_t lo_mask,
+                                                                                  uint32_t hi_mask,
+                                                                                  bool slice_localised_to_one_u64,
+                                                                                  size_t window_bits) noexcept
+{
+    uint64_t raw_wide = 0;
+    if (slice_localised_to_one_u64) {
+        // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it.
+        raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask;
+    } else if (lo_mask == 0) {
+        // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is
+        // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the
+        // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0.
+        // `lo_mask` is loop-invariant within a window but is a runtime value, so the
+        // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the
+        // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window.
+        raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits;
+    } else {
+        // Slow path: window straddles a uint64 boundary.
+        const uint64_t s_lo = scalar_data[lo_limb];
+        const uint64_t s_hi = scalar_data[hi_limb];
+        const uint64_t lo_part = (s_lo >> lo_off) & lo_mask;
+        const uint64_t hi_part = (s_hi & hi_mask) << lo_bits;
+        raw_wide = lo_part | hi_part;
+    }
+    // raw fits in window_bits+1 ≤ 32 bits, safe to narrow.
+    const uint32_t raw = static_cast<uint32_t>(raw_wide);
+
+    // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit
+    // `window_bits` is the sign indicator.
+    //
+    // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard
+    // branchless idiom. We use the equivalent `(encode - neg) ^ neg_mask` to break the
+    // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both
+    // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for
+    // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path
+    // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg}
+    // in parallel → ^neg_mask → &val_mask). Identical result by:
+    //   neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓
+    //   neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓
+    const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 };
+    const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF
+    const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1;
+    const uint32_t encode = (raw + 1) >> 1;
+    const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask;
+
+    // Pack into (sign | bucket): sign in bit 31, bucket magnitude in the low bits.
+    return (neg << 31) | bucket_idx;
+}
+
+// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed
+// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86
+// (SSE2), ARM (NEON), and WASM (wasm-simd128). The branch on slice path is hoisted from
+// the per-call site to the per-window outer loop, so callers select the localised / bottom /
+// boundary specialisation once per window.
+//
+// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`:
+// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …)
+// values. `scalar.data` is a `std::array<uint64_t, 4>` whose byte layout is identical to
+// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the
+// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`,
+// etc.). The reinterpret_cast is the same alias pattern.
+//
+// Writes the four packed digits to `dst[0..3]`. The caller scatters them individually,
+// since the consuming writes are not vectorisable. Switching from 2-wide uint64 to 4-wide
+// uint32 doubles the compute throughput per SIMD instruction at the cost of slightly more
+// straddle hits.
+using SimdU32x4 = uint32_t __attribute__((vector_size(16)));
+
+// Helpers pass values around as `SimdU32x4` so the v128 stays in the SIMD register file end-to-end.
+// Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots.
+
+// uint32-indexed Constantine slice params, mirroring `ConstantineSliceParams` but with
+// limb indices measured in 32-bit (rather than 64-bit) chunks. Computed once per window in
+// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below.
+struct ConstantineSliceParamsU32 {
+    uint32_t lo_mask; // mask applied to the bits taken from `lo_limb` (0 on the bottom window)
+    uint32_t hi_mask; // mask applied to `hi_limb` before the left shift (0 when clamped at the top window)
+    uint32_t lo_limb; // u32 limb index of the lookback bit
+    uint32_t hi_limb; // == lo_limb + 1, clamped to last in-range u32 limb at the top window
+    uint32_t lo_off;  // bit-offset of the lookback bit within `lo_limb`
+    uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount)
+    bool slice_localised_to_one_u32;
+    bool is_bottom_window;
+};
+
+[[nodiscard]] inline ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset,
+                                                                                    size_t window_bits,
+                                                                                    size_t num_u32_limbs) noexcept
+{
+    constexpr size_t LIMB_BITS_U32 = 32;
+    ConstantineSliceParamsU32 out;
+    out.is_bottom_window = (bit_offset == 0);
+    if (out.is_bottom_window) {
+        // No bit exists below the LSB: zero the lo side (lo_mask = 0) and take all window_bits
+        // bits from limb 0 through the hi side, shifted left by lo_bits = 1.
+        out.lo_limb = 0;
+        out.hi_limb = 0;
+        out.lo_off = LIMB_BITS_U32 - 1;
+        out.lo_bits = 1;
+        out.lo_mask = 0;
+        out.hi_mask = (uint32_t{ 1 } << window_bits) - 1;
+        out.slice_localised_to_one_u32 = false;
+        return out;
+    }
+
+    // The slice is [bit_offset - 1, bit_offset + window_bits): one lookback bit plus the window.
+    const size_t first_bit = bit_offset - 1;
+    const auto slice_bits = static_cast<uint32_t>(window_bits + 1);
+    out.lo_limb = static_cast<uint32_t>(first_bit / LIMB_BITS_U32);
+    out.lo_off = static_cast<uint32_t>(first_bit % LIMB_BITS_U32);
+    const uint32_t room_in_lo = static_cast<uint32_t>(LIMB_BITS_U32) - out.lo_off;
+    out.lo_bits = (room_in_lo < slice_bits) ? room_in_lo : slice_bits;
+    const uint32_t hi_bits = slice_bits - out.lo_bits;
+    out.lo_mask = (out.lo_bits == LIMB_BITS_U32) ? ~uint32_t{ 0 } : ((uint32_t{ 1 } << out.lo_bits) - 1);
+    // Past the last limb there is nothing to read: re-read lo_limb under a zero mask instead.
+    const bool clamp_hi = static_cast<size_t>(out.lo_limb) + 1 >= num_u32_limbs;
+    out.hi_limb = clamp_hi ? out.lo_limb : out.lo_limb + 1;
+    out.hi_mask = clamp_hi ? uint32_t{ 0 } : ((uint32_t{ 1 } << hi_bits) - 1);
+    out.slice_localised_to_one_u32 = (hi_bits == 0);
+    return out;
+}
+
+// Gather 4 disjoint uint32 values (element `idx` of each pointer) into one v128. On WASM
+// this is 1 zero splat + 4 v128.load32_lane (5 ops); brace-init `{a, b, c, d}` with runtime
+// values emits 4 scalar i32.load + 1 splat + 3 replace_lane (8 ops). On native it falls back
+// to brace-init, left to the compiler to lower (e.g. NEON ins; x86 pinsrd needs SSE4.1).
+[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32(
+    const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept
+{
+#ifdef __wasm_simd128__
+    v128_t v = wasm_i32x4_splat(0);
+    v = wasm_v128_load32_lane(p0 + idx, v, 0);
+    v = wasm_v128_load32_lane(p1 + idx, v, 1);
+    v = wasm_v128_load32_lane(p2 + idx, v, 2);
+    v = wasm_v128_load32_lane(p3 + idx, v, 3);
+    return reinterpret_cast<SimdU32x4>(v);
+#else
+    return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] };
+#endif
+}
+
+// Store a `SimdU32x4` to a 4-lane uint32 destination as a single 128-bit op.
+// `dst` only needs uint32 alignment. On WASM the explicit `wasm_v128_store`
+// is used because the earlier struct-wrapper assignment was observed to
+// round-trip the vector through 4 scalar memory slots. On native a fixed-size
+// `__builtin_memcpy` is used (typically one unaligned `movdqu` / `st1`): a
+// store through `SimdU32x4*` would assume 16-byte alignment `dst` may not have.
+[[gnu::always_inline]] inline void simd_u32x4_store(uint32_t* dst, SimdU32x4 v) noexcept
+{
+#ifdef __wasm_simd128__
+    wasm_v128_store(dst, reinterpret_cast<v128_t>(v));
+#else
+    __builtin_memcpy(dst, &v, sizeof(v));
+#endif
+}
+
+// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant
+// within a window. Callers build them ONCE per window in the outer-w loop and pass them in,
+// so the inner-i compute loop has zero v128.const / splat / shl+sub for the masks.
+// `neg_mask = -neg` uses GCC vector-ext unary minus which lowers to `i32x4.neg` on WASM.
+//
+// Helpers write the v128 result directly into the caller-provided buffer: dst[k] = packed digit of scalar_data_k.
+[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst,
+                                                                                const uint32_t* scalar_data_0,
+                                                                                const uint32_t* scalar_data_1,
+                                                                                const uint32_t* scalar_data_2,
+                                                                                const uint32_t* scalar_data_3,
+                                                                                uint32_t lo_limb,
+                                                                                uint32_t lo_off,
+                                                                                SimdU32x4 lo_mask_v,
+                                                                                SimdU32x4 one_v,
+                                                                                SimdU32x4 val_mask,
+                                                                                uint32_t window_bits) noexcept
+{
+    const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
+    const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v; // single-limb read: no hi_part on this path
+    const SimdU32x4 neg = (raw >> window_bits) & one_v;
+    const SimdU32x4 neg_mask = -neg;
+    const SimdU32x4 encode = (raw + one_v) >> 1;
+    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
+    const SimdU32x4 packed = (neg << 31) | bucket; // sign in bit 31, bucket magnitude in the low bits
+    simd_u32x4_store(dst, packed);
+}
+
+[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst,
+                                                                             const uint32_t* scalar_data_0,
+                                                                             const uint32_t* scalar_data_1,
+                                                                             const uint32_t* scalar_data_2,
+                                                                             const uint32_t* scalar_data_3,
+                                                                             uint32_t hi_limb,
+                                                                             uint32_t lo_bits,
+                                                                             SimdU32x4 hi_mask_v,
+                                                                             SimdU32x4 one_v,
+                                                                             SimdU32x4 val_mask,
+                                                                             uint32_t window_bits) noexcept
+{
+    const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
+    const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits; // no lo_part: this path gathers the hi limb only
+    const SimdU32x4 neg = (raw >> window_bits) & one_v;
+    const SimdU32x4 neg_mask = -neg;
+    const SimdU32x4 encode = (raw + one_v) >> 1;
+    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
+    const SimdU32x4 packed = (neg << 31) | bucket; // sign in bit 31, bucket magnitude in the low bits
+    simd_u32x4_store(dst, packed);
+}
+
+[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst,
+                                                                               const uint32_t* scalar_data_0,
+                                                                               const uint32_t* scalar_data_1,
+                                                                               const uint32_t* scalar_data_2,
+                                                                               const uint32_t* scalar_data_3,
+                                                                               uint32_t lo_limb,
+                                                                               uint32_t hi_limb,
+                                                                               uint32_t lo_off,
+                                                                               uint32_t lo_bits,
+                                                                               SimdU32x4 lo_mask_v,
+                                                                               SimdU32x4 hi_mask_v,
+                                                                               SimdU32x4 one_v,
+                                                                               SimdU32x4 val_mask,
+                                                                               uint32_t window_bits) noexcept
+{
+    const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
+    const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
+    const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v;
+    const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits; // hi bits land just above the lo_bits bits of lo_part
+    const SimdU32x4 raw = lo_part | hi_part;
+    const SimdU32x4 neg = (raw >> window_bits) & one_v;
+    const SimdU32x4 neg_mask = -neg;
+    const SimdU32x4 encode = (raw + one_v) >> 1;
+    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
+    const SimdU32x4 packed = (neg << 31) | bucket; // sign in bit 31, bucket magnitude in the low bits
+    simd_u32x4_store(dst, packed);
+}
+
+// Path-selector enum used to dispatch on the SIMD specialisation once per window rather
+// than once per scalar.
+enum class ConstantineSlicePath : uint8_t {
+    Localised = 0, // slice lies inside one u32 limb: single gather
+    Bottom = 1,    // bit_offset == 0: hi-limb gather only, lookback bit is a synthetic 0
+    Boundary = 2,  // everything else: lo + hi gathers, merged
+};
+
+[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32(
+    const ConstantineSliceParamsU32& sp) noexcept
+{
+    // The bottom-window flag takes precedence over the localised flag.
+    if (sp.is_bottom_window) {
+        return ConstantineSlicePath::Bottom;
+    }
+    // Otherwise a slice inside one u32 limb is Localised and anything else is Boundary.
+    const bool one_limb = sp.slice_localised_to_one_u32;
+    return one_limb ? ConstantineSlicePath::Localised : ConstantineSlicePath::Boundary;
+}
+
+} // namespace bb::scalar_multiplication::round_parallel_detail
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp
new file mode 100644
index 000000000000..199166832bc7
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp
@@ -0,0 +1,443 @@
+// Unit tests for the Constantine signed-Booth window recoder used by the
+// round-parallel Pippenger MSM. Validates the scalar packed-digit recoder,
+// the SIMD x4 specialisations (Localised / Bottom / Boundary), and the
+// round-trip identity `Σ_w (-1)^sign_w · bucket_w · 2^{B_w} ≡ scalar`.
+
+#include "pippenger_constantine.hpp"
+
+#include "barretenberg/ecc/curves/bn254/fr.hpp"
+#include "barretenberg/numeric/random/engine.hpp"
+#include "barretenberg/numeric/uint256/uint256.hpp"
+
+#include <array>
+#include <cstdint>
+#include <gtest/gtest.h>
+#include <vector>
+
+namespace {
+
+namespace cnst = bb::scalar_multiplication::round_parallel_detail; // namespace of the recoder under test
+using ScalarField = bb::fr; // NOTE(review): not referenced anywhere else in this test file
+auto& engine = bb::numeric::get_randomness(); // randomness source used by random_scalar_limbs()
+
+constexpr size_t LIMB_BITS_U64 = 64; // bits per uint64 limb
+constexpr size_t NUM_LIMBS_U64 = 4;  // limb count passed to the u64-indexed param computation
+constexpr size_t NUM_LIMBS_U32 = 8;  // limb count passed to the u32-indexed param computation
+constexpr size_t MAX_BITS = 256;     // the reference oracle reads bits at or above this index as zero
+
+// =============================================================================
+// Reference signed-window encoder. Reads `(window_bits + 1)` bits from the
+// scalar starting at `bit_offset - 1` (with a synthetic 0 at bit -1 when
+// bit_offset == 0), then applies the signed-Booth encode:
+//
+//   raw  = bits [bit_offset-1, bit_offset + window_bits)
+//   neg  = raw >> window_bits           (top bit = sign indicator)
+//   encode = (raw + 1) >> 1             (drop the lookback bit)
+//   bucket = (encode - neg) ^ (-neg)    (conditional negate, branchless)
+//   packed = (neg << 31) | bucket
+//
+// Same algebra as `get_constantine_packed_digit`, but implemented in the most
+// obvious way against a flat `bit_at(i)` accessor so any error in the
+// production path's limb-walking or branchless conditional negate will diverge.
+// =============================================================================
+uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
+{
+    // Flat bit accessor: positions below 0 or at/above MAX_BITS read as zero.
+    const auto read_bit = [scalar_data](int64_t pos) -> uint32_t {
+        if (pos < 0 || static_cast<size_t>(pos) >= MAX_BITS) {
+            return 0U;
+        }
+        const auto idx = static_cast<size_t>(pos);
+        return static_cast<uint32_t>((scalar_data[idx / LIMB_BITS_U64] >> (idx % LIMB_BITS_U64)) & uint64_t{ 1 });
+    };
+    // Gather window_bits + 1 bits, least significant first, beginning at the lookback bit.
+    uint32_t window = 0;
+    for (size_t shift = 0; shift <= window_bits; ++shift) {
+        window |= read_bit(static_cast<int64_t>(bit_offset + shift) - 1) << shift;
+    }
+    const uint32_t sign = (window >> window_bits) & 1U;
+    const uint32_t half = (window + 1) >> 1;
+    const uint32_t magnitude = ((half - sign) ^ (uint32_t{ 0 } - sign)) & ((uint32_t{ 1 } << window_bits) - 1);
+    return (sign << 31) | magnitude;
+}
+
+// Random 256-bit limb pattern: four draws from `engine.get_random_uint64()`, one
+// per limb. The value is NOT reduced below the field modulus, so bits 254 and 255
+// may be set; the recoder is exercised on raw limbs and must cope with that.
+std::array<uint64_t, NUM_LIMBS_U64> random_scalar_limbs()
+{
+    std::array<uint64_t, NUM_LIMBS_U64> limbs{};
+    for (uint64_t& limb : limbs) {
+        limb = engine.get_random_uint64();
+    }
+    return limbs;
+}
+
+// Scalar storage viewed as uint32 limbs (little-endian assumed); the SIMD x4 helpers index by uint32.
+const uint32_t* as_u32(const std::array<uint64_t, NUM_LIMBS_U64>& s)
+{
+    const auto* limbs_u32 = reinterpret_cast<const uint32_t*>(s.data());
+    return limbs_u32;
+}
+
+// Scalar production path. Derives the u64-indexed slice params through
+// `compute_constantine_slice_params` and hands them, field by field, to
+// `get_constantine_packed_digit`. The struct is deliberately unpacked at the
+// call site so that a signature change in either function surfaces here.
+uint32_t production_scalar_path(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
+{
+    const auto params = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64);
+    return cnst::get_constantine_packed_digit(scalar_data,
+                                              params.lo_limb,
+                                              params.hi_limb,
+                                              params.lo_off,
+                                              params.lo_bits,
+                                              params.lo_mask,
+                                              params.hi_mask,
+                                              params.slice_localised_to_one_u64,
+                                              window_bits);
+}
+
+// SIMD x4 production path. Classifies the window once with `classify_slice_path_u32`
+// and forwards to the matching `store_constantine_packed_digits_x4_*` helper;
+// `out[i]` receives the packed digit of `scalars[i]`. All vector operands are
+// four-lane broadcasts of per-window values.
+void production_simd_path(const std::array<uint64_t, NUM_LIMBS_U64> scalars[4],
+                          size_t bit_offset,
+                          size_t window_bits,
+                          uint32_t out[4])
+{
+    const auto broadcast = [](uint32_t v) { return cnst::SimdU32x4{ v, v, v, v }; };
+    const auto params = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
+    const auto width = static_cast<uint32_t>(window_bits);
+
+    const cnst::SimdU32x4 lo_mask_v = broadcast(params.lo_mask);
+    const cnst::SimdU32x4 hi_mask_v = broadcast(params.hi_mask);
+    const cnst::SimdU32x4 one_v = broadcast(1U);
+    const cnst::SimdU32x4 val_mask = broadcast((uint32_t{ 1 } << window_bits) - 1);
+
+    const uint32_t* lane0 = as_u32(scalars[0]);
+    const uint32_t* lane1 = as_u32(scalars[1]);
+    const uint32_t* lane2 = as_u32(scalars[2]);
+    const uint32_t* lane3 = as_u32(scalars[3]);
+    switch (cnst::classify_slice_path_u32(params)) {
+    case cnst::ConstantineSlicePath::Localised:
+        cnst::store_constantine_packed_digits_x4_localised(
+            out, lane0, lane1, lane2, lane3, params.lo_limb, params.lo_off, lo_mask_v, one_v, val_mask, width);
+        break;
+    case cnst::ConstantineSlicePath::Bottom:
+        cnst::store_constantine_packed_digits_x4_bottom(
+            out, lane0, lane1, lane2, lane3, params.hi_limb, params.lo_bits, hi_mask_v, one_v, val_mask, width);
+        break;
+    case cnst::ConstantineSlicePath::Boundary:
+        cnst::store_constantine_packed_digits_x4_boundary(out,
+                                                          lane0,
+                                                          lane1,
+                                                          lane2,
+                                                          lane3,
+                                                          params.lo_limb,
+                                                          params.hi_limb,
+                                                          params.lo_off,
+                                                          params.lo_bits,
+                                                          lo_mask_v,
+                                                          hi_mask_v,
+                                                          one_v,
+                                                          val_mask,
+                                                          width);
+        break;
+    }
+}
+
+} // namespace
+
+// =============================================================================
+// Test 1 — The scalar packed-digit recoder agrees with the reference oracle for
+// every `(window_bits, bit_offset)` shape in the swept grid, on random limbs.
+// =============================================================================
+TEST(PippengerConstantine, ScalarMatchesReferenceOracleAllWindowBits)
+{
+    constexpr size_t TRIALS_PER_SHAPE = 32;
+    // window_bits 1..19 mirrors production as described for choose_window_bits (2..19) plus the 1-bit final
+    // window build_var_window_schedule can emit (wb=3 over 256 bits yields 85*3 + 1). bit_offset goes up to 255,
+    // where the read runs past bit 255; bits above the 256-bit storage must be treated as zero.
+    for (size_t window_bits = 1; window_bits <= 19; ++window_bits) {
+        for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) {
+            for (size_t trial = 0; trial < TRIALS_PER_SHAPE; ++trial) {
+                const auto limbs = random_scalar_limbs();
+                const uint32_t expected = reference_packed_digit(limbs.data(), bit_offset, window_bits);
+                const uint32_t actual = production_scalar_path(limbs.data(), bit_offset, window_bits);
+                ASSERT_EQ(actual, expected)
+                    << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " trial=" << trial;
+            }
+        }
+    }
+}
+
+// =============================================================================
+// Test 2 — Lane by lane, the SIMD x4 path returns the same packed digit as the
+// scalar path, for each of its three specialisations (Localised / Bottom /
+// Boundary). The specialisation is a function of (window_bits, bit_offset)
+// alone, so sweeping the whole grid is what reaches all three.
+// =============================================================================
+TEST(PippengerConstantine, SimdX4MatchesScalarPathLanewise)
+{
+    using Path = cnst::ConstantineSlicePath;
+    constexpr size_t TRIALS_PER_SHAPE = 16;
+
+    // Coverage bookkeeping: which specialisations the sweep has selected so far.
+    bool hit_localised = false;
+    bool hit_bottom = false;
+    bool hit_boundary = false;
+    // Same grid as Test 1: window_bits 1..19 (2..19 from choose_window_bits plus the
+    // 1-bit final window build_var_window_schedule can emit) and bit_offset 0..255,
+    // the last offsets reading past the top of the 256-bit storage.
+    for (size_t window_bits = 1; window_bits <= 19; ++window_bits) {
+        for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) {
+            const Path path = cnst::classify_slice_path_u32(
+                cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32));
+            hit_localised = hit_localised || (path == Path::Localised);
+            hit_bottom = hit_bottom || (path == Path::Bottom);
+            hit_boundary = hit_boundary || (path == Path::Boundary);
+
+            for (size_t trial = 0; trial < TRIALS_PER_SHAPE; ++trial) {
+                // One independent random scalar per SIMD lane, drawn in lane order.
+                std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> lanes{};
+                for (auto& lane_limbs : lanes) {
+                    lane_limbs = random_scalar_limbs();
+                }
+                std::array<uint32_t, 4> simd_digits{};
+                production_simd_path(lanes.data(), bit_offset, window_bits, simd_digits.data());
+
+                // Every lane must reproduce what the scalar path gives for that scalar alone.
+                for (size_t lane = 0; lane < lanes.size(); ++lane) {
+                    const uint32_t scalar_digit = production_scalar_path(lanes[lane].data(), bit_offset, window_bits);
+                    ASSERT_EQ(simd_digits[lane], scalar_digit)
+                        << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " lane=" << lane;
+                }
+            }
+        }
+    }
+    // Coverage guard, not a behavioural claim: a specialisation the sweep never
+    // selected would have gone through the lane-wise comparison zero times.
+    EXPECT_TRUE(hit_localised);
+    EXPECT_TRUE(hit_bottom);
+    EXPECT_TRUE(hit_boundary);
+}
+
+// =============================================================================
+// Test 3 — Round-trip identity. For a window schedule that tiles [0, total_bits)
+// without gaps, `Σ_w (-1)^sign_w · bucket_w · 2^{B_w}` must equal the scalar
+// modulo 2^total_bits (here total_bits = 256, checked in uint256_t). The rest of
+// the MSM relies on this identity, and a violation would not crash anything —
+// it would silently yield a wrong result.
+// =============================================================================
+TEST(PippengerConstantine, RoundTripIdentityMatchesScalarMod2N)
+{
+    constexpr size_t TOTAL_BITS = 254;
+    constexpr size_t TRIALS = 64;
+    // One recoded window: sign flag, magnitude, and the bit position it is scaled by.
+    struct Digit {
+        bool negative;
+        uint32_t bucket;
+        size_t bit_offset;
+    };
+    // window_bits == 1 is part of the sweep because the last tile is clipped to
+    // whatever bits remain, and that remainder can be a single bit.
+    for (size_t window_bits = 1; window_bits <= 19; ++window_bits) {
+        for (size_t trial = 0; trial < TRIALS; ++trial) {
+            const auto limbs = random_scalar_limbs();
+
+            // Phase 1: tile [0, TOTAL_BITS + 2) = [0, 256) with windows of width
+            // `window_bits` (the last one clipped) and recode each tile. The +2 is
+            // described as mirroring the `total_bits = num_bits + 2` budget of
+            // `build_var_window_schedule`, which absorbs the carry-less top bit.
+            std::vector<Digit> digits;
+            size_t bit_offset = 0;
+            for (size_t remaining = TOTAL_BITS + 2; remaining > 0;) {
+                const size_t wb = std::min(window_bits, remaining);
+                const uint32_t packed = production_scalar_path(limbs.data(), bit_offset, wb);
+                const uint32_t bucket = packed & ((uint32_t{ 1 } << wb) - 1);
+                digits.push_back(Digit{ (packed >> 31) != 0U, bucket, bit_offset });
+                bit_offset += wb;
+                remaining -= wb;
+            }
+
+            // Phase 2: rebuild Σ_w ±bucket_w · 2^{bit_offset_w} in uint256_t, which
+            // wraps modulo 2^256; a negative digit subtracts its shifted magnitude.
+            bb::numeric::uint256_t acc(0);
+            for (const Digit& digit : digits) {
+                const bb::numeric::uint256_t shifted = bb::numeric::uint256_t(static_cast<uint64_t>(digit.bucket))
+                                                       << bb::numeric::uint256_t(digit.bit_offset);
+                if (digit.negative) {
+                    acc -= shifted;
+                } else {
+                    acc += shifted;
+                }
+            }
+            const bb::numeric::uint256_t scalar_val(limbs[0], limbs[1], limbs[2], limbs[3]);
+            EXPECT_EQ(acc, scalar_val) << "window_bits=" << window_bits << " trial=" << trial;
+        }
+    }
+}
+
+// =============================================================================
+// Test 4 — Edge cases. Each structural boundary gets its own explicit check, so
+// a regression there is reported by name and not only via a random trial.
+// =============================================================================
+TEST(PippengerConstantine, EdgeCases)
+{
+    // (a) All-zero scalar: for every width 1..19 and offset 0..255 the packed
+    // digit must be 0 (sign bit clear, bucket zero). Every bit the recoder can
+    // read is zero here, whether inside the storage or above its top edge.
+    const std::array<uint64_t, NUM_LIMBS_U64> all_zero{};
+    for (size_t width = 1; width <= 19; ++width) {
+        for (size_t offset = 0; offset <= 255; ++offset) {
+            EXPECT_EQ(production_scalar_path(all_zero.data(), offset, width), uint32_t{ 0 })
+                << "zero scalar wb=" << width << " off=" << offset;
+        }
+    }
+
+    // (b) Bottom window: bit_offset == 0 has no real lookback bit, so the u32
+    // params must raise `is_bottom_window` and the classifier must answer Bottom.
+    const auto bottom_params = cnst::compute_constantine_slice_params_u32(0, 12, NUM_LIMBS_U32);
+    EXPECT_TRUE(bottom_params.is_bottom_window);
+    EXPECT_EQ(cnst::classify_slice_path_u32(bottom_params), cnst::ConstantineSlicePath::Bottom);
+
+    // (c) Top window: with wb=12, offsets 240..255 walk the read up to and past
+    // bit 255. The production path is described as clamping `hi_limb` and zeroing
+    // `hi_mask` there; whatever the mechanism, its digit must match the reference
+    // oracle, which reads every bit at or above MAX_BITS as zero. The limbs are
+    // random, so bits 254 and 255 of the input may be set.
+    constexpr size_t window_bits = 12;
+    const auto top_limbs = random_scalar_limbs();
+    for (size_t bit_offset = 240; bit_offset <= 255; ++bit_offset) {
+        const uint32_t want = reference_packed_digit(top_limbs.data(), bit_offset, window_bits);
+        const uint32_t got = production_scalar_path(top_limbs.data(), bit_offset, window_bits);
+        EXPECT_EQ(got, want) << "top window bit_offset=" << bit_offset;
+    }
+
+    // (d) Localised shape (u64 view): bit_offset=10 with wb=12 places the lookback
+    // at bit 9 and the window at bits 10..21. All c+1 bits sit inside u64 limb 0,
+    // so `slice_localised_to_one_u64` — the flag the scalar path is handed — must
+    // be set.
+    const auto local_params = cnst::compute_constantine_slice_params(10, 12, NUM_LIMBS_U64);
+    EXPECT_TRUE(local_params.slice_localised_to_one_u64);
+
+    // (e) Straddling shape (u64 view): bit_offset=60 with wb=12 reads bits 59..71,
+    // crossing the limb boundary between bits 63 and 64, so the same flag must be
+    // clear.
+    const auto straddle_params = cnst::compute_constantine_slice_params(60, 12, NUM_LIMBS_U64);
+    EXPECT_FALSE(straddle_params.slice_localised_to_one_u64);
+}
+
+// =============================================================================
+// Test 5 — Named slice-shape table. The random sweeps in Tests 1–2 very likely
+// reach every (limb_index, slice_path) combination, but a regression at one
+// specific boundary (say "crossing bit 31→32 with the lookback in the low
+// half") is reported here under its own name instead of as an anonymous trial.
+//
+// `bit_offset` is the absolute position of the FIRST window bit; the lookback
+// bit sits at `bit_offset - 1`. Every row fixes a (bit_offset, wb) pair, the
+// slice path expected from the u32 classifier, and the localisation flag
+// expected from the u64 params.
+// =============================================================================
+TEST(PippengerConstantine, NamedSliceShapes)
+{
+    using Path = cnst::ConstantineSlicePath;
+    struct NamedShape {
+        const char* label;
+        size_t bit_offset;
+        size_t window_bits;
+        Path expected_u32_path;
+        bool expected_u64_localised; // expected `slice_localised_to_one_u64`
+    };
+    // Row families, one structurally distinct shape each:
+    //   - bottom_*    : bit_offset 0, synthetic lookback
+    //   - local_*     : lookback + window inside one u64 limb (u32 view may differ)
+    //   - boundary_*  : read straddles a u64 and/or u32 limb boundary
+    //   - top_*       : read sits near, or runs past, the top of the 256-bit storage
+    const std::array<NamedShape, 12> shapes{ {
+        // bit_offset 0 at three widths.
+        { "bottom_wb12", 0, 12, Path::Bottom, false },
+        { "bottom_wb2", 0, 2, Path::Bottom, false },
+        { "bottom_wb19", 0, 19, Path::Bottom, false },
+        // Bits 9..21: inside u32 limb 0, hence also inside u64 limb 0.
+        { "local_lo_u32", 10, 12, Path::Localised, true },
+        // Lookback 30, window 31..42: crosses u32 bit 31→32 but stays inside u64 limb 0.
+        { "local_u64_boundary_u32", 31, 12, Path::Boundary, true },
+        // Reads that cross a u64 limb boundary (63→64, 127→128, 191→192).
+        { "boundary_u64_at_63", 60, 12, Path::Boundary, false },
+        { "boundary_u64_at_127", 124, 12, Path::Boundary, false },
+        { "boundary_u64_at_191", 188, 12, Path::Boundary, false },
+        // Bits 29..33: lookback in the low u32 half, window crossing 31→32.
+        { "boundary_u32_at_31", 30, 4, Path::Boundary, true },
+        // Bits 245..257: the read runs past bit 255, i.e. past u32 limb 7, the last
+        // limb of the 256-bit storage.
+        { "top_clamped_wb12", 246, 12, Path::Boundary, false },
+        // Bits 253..254: the 1-bit final window `build_var_window_schedule` can emit.
+        { "top_wb1_final", 254, 1, Path::Localised, true },
+        // Bits 79..91: a mid-scalar localised shape as a plain anchor.
+        { "local_mid_u64", 80, 12, Path::Localised, true },
+    } };
+
+    const auto limbs = random_scalar_limbs();
+    for (const NamedShape& shape : shapes) {
+        const auto params_u64 = cnst::compute_constantine_slice_params(shape.bit_offset, shape.window_bits, NUM_LIMBS_U64);
+        const auto params_u32 = cnst::compute_constantine_slice_params_u32(shape.bit_offset, shape.window_bits, NUM_LIMBS_U32);
+        EXPECT_EQ(cnst::classify_slice_path_u32(params_u32), shape.expected_u32_path) << "case=" << shape.label;
+        EXPECT_EQ(params_u64.slice_localised_to_one_u64, shape.expected_u64_localised) << "case=" << shape.label;
+
+        // Independently of the classification, the digit must match the reference oracle.
+        const uint32_t want = reference_packed_digit(limbs.data(), shape.bit_offset, shape.window_bits);
+        const uint32_t got = production_scalar_path(limbs.data(), shape.bit_offset, shape.window_bits);
+        EXPECT_EQ(got, want) << "case=" << shape.label;
+    }
+}
+
+// =============================================================================
+// Test 6 — Cross-check of the u64 and u32 slice-param computations.
+//
+// The scalar path is driven by `ConstantineSliceParams` (u64-indexed) and the
+// SIMD path by `ConstantineSliceParamsU32` (u32-indexed). Tests 1 and 2 compare
+// final packed digits, which exposes end-to-end divergence but could in
+// principle hide two errors that cancel out. This test therefore compares the
+// params directly: both views must agree on whether a window is the bottom
+// window and on the absolute position of the lookback bit, so a mistake in one
+// computation alone is visible even when the digits still agree.
+// =============================================================================
+TEST(PippengerConstantine, ParamClassifierU64U32Consistency)
+{
+    for (size_t wb = 1; wb <= 19; ++wb) {
+        for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) {
+            const auto params_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, wb, NUM_LIMBS_U32);
+            const auto params_u64 = cnst::compute_constantine_slice_params(bit_offset, wb, NUM_LIMBS_U64);
+
+            // Bottom-window agreement. The u64 params carry no explicit flag; this test
+            // reads `lo_mask == 0` as their bottom marker and compares it with the u32 flag.
+            EXPECT_EQ(params_u64.lo_mask == 0, params_u32.is_bottom_window)
+                << "bottom classification disagrees at bit_offset=" << bit_offset << " wb=" << wb;
+
+            // Lookback position. Outside the bottom window (where the lookback is synthetic and
+            // limb/offset need not name a real bit), limb * limb_width + offset must resolve to
+            // the same absolute bit in both views, namely bit_offset - 1.
+            if (!params_u32.is_bottom_window) {
+                const size_t lookback_u64 = params_u64.lo_limb * 64 + params_u64.lo_off;
+                const size_t lookback_u32 = params_u32.lo_limb * 32 + params_u32.lo_off;
+                EXPECT_EQ(lookback_u64, lookback_u32)
+                    << "lookback bit disagrees at bit_offset=" << bit_offset << " wb=" << wb;
+                EXPECT_EQ(lookback_u64, bit_offset - 1)
+                    << "lookback bit ≠ bit_offset-1 at bit_offset=" << bit_offset << " wb=" << wb;
+            }
+
+            // Localisation implication. A u64-localised window has all c+1 bits in one u64
+            // limb; it may still straddle a u32 boundary inside that limb, so the u32 path
+            // can be Localised or Boundary. What it can never be, for bit_offset > 0, is
+            // Bottom.
+            const bool u64_localised_above_bottom = params_u64.slice_localised_to_one_u64 && bit_offset > 0;
+            if (u64_localised_above_bottom) {
+                EXPECT_NE(cnst::classify_slice_path_u32(params_u32), cnst::ConstantineSlicePath::Bottom)
+                    << "u64-localised but u32 classifier says Bottom at bit_offset=" << bit_offset << " wb=" << wb;
+            }
+        }
+    }
+}

From 43a1a1b7bc31bf5f891a15d400903d6ee05ec24f Mon Sep 17 00:00:00 2001
From: sergei iakovenko <105737703+iakovenkos@users.noreply.github.com>
Date: Tue, 26 May 2026 16:31:20 +0200
Subject: [PATCH 2/2] fix: revert extract Constantine signed-Booth window
 recoder (#23561)

Reverts d76a3163082 which was pushed directly to
merge-train/barretenberg without a PR. Will re-land via proper PR from
`si/pippenger-constantine`.
---
 .../pippenger_constantine.fuzzer.cpp          | 160 -------
 .../pippenger_constantine.hpp                 | 389 ---------------
 .../pippenger_constantine.test.cpp            | 443 ------------------
 3 files changed, 992 deletions(-)
 delete mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
 delete mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
 delete mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
deleted file mode 100644
index 451a9baa31b9..000000000000
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-// libFuzzer target for the Constantine signed-Booth window recoder.
-//
-// Two-pronged differential check on each input:
-//   1. Scalar path vs textbook reference oracle — catches encoder algebra bugs.
-//   2. SIMD x4 path vs scalar path (lane-by-lane) — catches lane-mux / mask /
-//      vector-shift bugs in the three slice-path specialisations.
-//
-// Input layout: 1 byte window_bits ∈ [2, 18], 1 byte bit_offset ∈ [0, 254],
-// followed by 32 bytes × 4 = 128 bytes of scalar limb material. Total minimum
-// input = 130 bytes; smaller inputs are zero-padded so libFuzzer's empty-seed
-// kickoff still drives the encoder.
-//
-// Run:
-//   cmake --preset fuzzing && cmake --build --preset fuzzing --target ecc_pippenger_constantine_fuzzer
-//   ./build-fuzzing/bin/ecc_pippenger_constantine_fuzzer -max_total_time=60
-
-#include "pippenger_constantine.hpp"
-
-#include "barretenberg/numeric/uint256/uint256.hpp"
-
-#include <array>
-#include <cstdint>
-#include <cstring>
-
-namespace {
-
-namespace cnst = bb::scalar_multiplication::round_parallel_detail;
-
-constexpr size_t LIMB_BITS_U64 = 64;
-constexpr size_t NUM_LIMBS_U64 = 4;
-constexpr size_t NUM_LIMBS_U32 = 8;
-constexpr size_t MAX_BITS = 256;
-constexpr size_t SCALAR_BYTES = 32;
-
-uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
-{
-    auto bit_at = [&](int64_t i) -> uint64_t {
-        if (i < 0 || static_cast<size_t>(i) >= MAX_BITS) {
-            return 0;
-        }
-        return (scalar_data[static_cast<size_t>(i) / LIMB_BITS_U64] >> (static_cast<size_t>(i) % LIMB_BITS_U64)) &
-               uint64_t{ 1 };
-    };
-    uint32_t raw = 0;
-    for (size_t k = 0; k <= window_bits; ++k) {
-        const int64_t bit_idx = static_cast<int64_t>(bit_offset) + static_cast<int64_t>(k) - 1;
-        raw |= static_cast<uint32_t>(bit_at(bit_idx)) << k;
-    }
-    const uint32_t neg = (raw >> window_bits) & 1U;
-    const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1;
-    const uint32_t encode = (raw + 1) >> 1;
-    const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask;
-    return (neg << 31) | bucket;
-}
-
-uint32_t production_scalar(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
-{
-    const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64);
-    return cnst::get_constantine_packed_digit(scalar_data,
-                                              sp.lo_limb,
-                                              sp.hi_limb,
-                                              sp.lo_off,
-                                              sp.lo_bits,
-                                              sp.lo_mask,
-                                              sp.hi_mask,
-                                              sp.slice_localised_to_one_u64,
-                                              window_bits);
-}
-
-void production_simd(const std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4>& scalars,
-                     size_t bit_offset,
-                     size_t window_bits,
-                     std::array<uint32_t, 4>& out)
-{
-    const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
-    const cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask };
-    const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask };
-    const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 };
-    const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1;
-    const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar };
-    const auto* s0 = reinterpret_cast<const uint32_t*>(scalars[0].data());
-    const auto* s1 = reinterpret_cast<const uint32_t*>(scalars[1].data());
-    const auto* s2 = reinterpret_cast<const uint32_t*>(scalars[2].data());
-    const auto* s3 = reinterpret_cast<const uint32_t*>(scalars[3].data());
-    const auto wb_u32 = static_cast<uint32_t>(window_bits);
-
-    switch (cnst::classify_slice_path_u32(sp)) {
-    case cnst::ConstantineSlicePath::Localised:
-        cnst::store_constantine_packed_digits_x4_localised(
-            out.data(), s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32);
-        break;
-    case cnst::ConstantineSlicePath::Bottom:
-        cnst::store_constantine_packed_digits_x4_bottom(
-            out.data(), s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32);
-        break;
-    case cnst::ConstantineSlicePath::Boundary:
-        cnst::store_constantine_packed_digits_x4_boundary(out.data(),
-                                                          s0,
-                                                          s1,
-                                                          s2,
-                                                          s3,
-                                                          sp.lo_limb,
-                                                          sp.hi_limb,
-                                                          sp.lo_off,
-                                                          sp.lo_bits,
-                                                          lo_mask_v,
-                                                          hi_mask_v,
-                                                          one_v,
-                                                          val_mask,
-                                                          wb_u32);
-        break;
-    }
-}
-
-} // namespace
-
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
-{
-    // Pad input to the minimum required length so empty / tiny seeds still
-    // exercise the encoder against zero-extended scalars.
-    constexpr size_t MIN_INPUT = 2 + (SCALAR_BYTES * 4);
-    std::array<uint8_t, MIN_INPUT> buf{};
-    std::memcpy(buf.data(), data, std::min(size, MIN_INPUT));
-
-    // window_bits ∈ [1, 19] — `choose_window_bits` returns [2,19]; the final
-    // window emitted by `build_var_window_schedule` can additionally be 1 bit
-    // (e.g. wb=3 over 256 bits = 85*3+1). Outside this range the encoder has
-    // no well-defined behavior in production.
-    const size_t window_bits = 1 + (buf[0] % 19);
-    // bit_offset ∈ [0, 255] — the live pipeline's range, including the top
-    // edge where bit_offset+wb extends past the scalar's 256 bits (production
-    // code clamps `hi_limb` and zeros `hi_mask`).
-    const size_t bit_offset = buf[1] & 0xff;
-
-    std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> scalars{};
-    for (size_t lane = 0; lane < 4; ++lane) {
-        std::memcpy(scalars[lane].data(), buf.data() + 2 + (lane * SCALAR_BYTES), SCALAR_BYTES);
-    }
-
-    // Check 1: scalar path matches the textbook reference oracle.
-    for (size_t lane = 0; lane < 4; ++lane) {
-        const uint32_t got = production_scalar(scalars[lane].data(), bit_offset, window_bits);
-        const uint32_t want = reference_packed_digit(scalars[lane].data(), bit_offset, window_bits);
-        if (got != want) {
-            __builtin_trap();
-        }
-    }
-
-    // Check 2: SIMD x4 path agrees with scalar path lane-by-lane.
-    std::array<uint32_t, 4> simd_out{};
-    production_simd(scalars, bit_offset, window_bits, simd_out);
-    for (size_t lane = 0; lane < 4; ++lane) {
-        const uint32_t want = production_scalar(scalars[lane].data(), bit_offset, window_bits);
-        if (simd_out[lane] != want) {
-            __builtin_trap();
-        }
-    }
-
-    return 0;
-}
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
deleted file mode 100644
index ec2c3e6800b2..000000000000
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
+++ /dev/null
@@ -1,389 +0,0 @@
-// Constantine-style signed-Booth window recoder for Pippenger MSM.
-//
-// Given a scalar s = sum_i s_i 2^i and a window [b, b + c), this module computes a
-// signed digit d in [-(2^c - 1), 2^c - 1] such that the scalar can be reconstructed as
-// s = sum_w d_w 2^{b_w}. It returns d as a packed `(sign | bucket)` value, where
-// `bucket = |d|` and `sign` records whether d is negative.
-//
-// Implements the carry-less `signedWindowEncoding` / `getSignedFullWindowAt` pattern from
-// `constantine/math/arithmetic/bigints.nim`: each window reads c+1 bits including the
-// previous window boundary bit, lets that shared boundary bit substitute for an explicit
-// carry, and produces a `(sign | bucket)` packed digit.
-//
-// Assumptions: production callers pass `window_bits` in [1, 19] and bit offsets within a
-// 256-bit scalar. The bit-twiddling below assumes `window_bits < 32`.
-//
-// Two parallel paths:
-//   * scalar path  — `ConstantineSliceParams` + `get_constantine_packed_digit` (uint64-
-//     indexed limbs).
-//   * SIMD x4 path — `ConstantineSliceParamsU32` + `store_constantine_packed_digits_x4_*`
-//     (uint32-indexed limbs, processes 4 scalars per call via GCC vector_size).
-//
-// The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window
-// branch is hoisted out of the per-scalar loop. `classify_slice_path_u32` returns the
-// matching enum for callers to dispatch on once per window.
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#endif
-
-namespace bb::scalar_multiplication::round_parallel_detail {
-
-/**
- * @brief Per-window precomputed slice parameters for the carry-less signed-Booth window
- *        recoding (after Constantine `signedWindowEncoding` / `getSignedFullWindowAt`,
- *        `constantine/math/arithmetic/bigints.nim`). Computed once per window by the
- *        caller; the per-scalar hot path is then fixed bit-twiddling with no per-iteration
- *        slice address arithmetic.
- *        Carry-less because every non-bottom window's c+1-bit read shares its boundary bit
- *        with the previous window — the bit a non-overlapping recoder would carry.
- *
- * `slice_localised_to_one_u64`: true iff every bit of the c+1-bit window lives inside a
- * single uint64 limb. Most windows on typical 254-bit scalars with c in [12, 19]
- * (lookback bits at non-boundary positions) hit this and take the fast path: one load,
- * one shift, one mask. The slow path is the boundary-straddling case + the synthetic-
- * lookback bottom window.
- */
-struct ConstantineSliceParams {
-    uint32_t lo_mask;
-    uint32_t hi_mask;
-    uint32_t lo_limb;
-    uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window
-    uint32_t lo_off;
-    uint32_t lo_bits;
-    bool slice_localised_to_one_u64;
-};
-
-/**
- * @brief Compute the Constantine slice params for a window starting at absolute bit position
- *        `bit_offset` (= Σ_{k<w} window_bits_k under variable-window, or w·window_bits under
- *        uniform-window). The slice is `[bit_offset - 1, bit_offset + window_bits)`; the bit at
- *        bit_offset - 1 is the shared boundary bit. The bottom window (bit_offset == 0) is
- *        encoded specially so the same recoding algebra applies.
- */
-[[nodiscard]] inline ConstantineSliceParams compute_constantine_slice_params(size_t bit_offset,
-                                                                             size_t window_bits,
-                                                                             size_t num_uint64_limbs) noexcept
-{
-    constexpr size_t LIMB_BITS = 64;
-    ConstantineSliceParams sp;
-    if (bit_offset == 0) {
-        // Bottom window: the boundary bit below the LSB is a synthetic 0. Encode this by
-        // reading "limb -1" as a zero-masked load (lo_mask = 0), then reading window_bits
-        // bits from limb 0 into the hi side and shifting them left by 1. This puts the
-        // window_bits-bit window at bits 1..window_bits with bit 0 = 0, matching the inner-
-        // loop body used by every other window. Not localised — the synthetic-lookback
-        // assembly only works in the slow path.
-        sp.lo_limb = 0; // safe in-range, but masked to 0
-        sp.hi_limb = 0; // = scalar limb 0
-        sp.lo_off = LIMB_BITS - 1;
-        sp.lo_bits = 1; // shifts hi_part left by 1, planting the window_bits-bit window at bits 1..window_bits
-        sp.lo_mask = 0; // lo_part contributes nothing
-        sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1;
-        sp.slice_localised_to_one_u64 = false;
-    } else {
-        const size_t lookback_bit = bit_offset - 1;
-        const size_t bits_to_read = window_bits + 1;
-        sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS);
-        sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS - 1));
-        sp.lo_bits = static_cast<uint32_t>(LIMB_BITS - sp.lo_off < bits_to_read ? LIMB_BITS - sp.lo_off : bits_to_read);
-        const uint32_t hi_bits = static_cast<uint32_t>(bits_to_read) - sp.lo_bits;
-        // window_bits+1 ≤ 32 for our windows ⇒ lo_bits ≤ 32 ⇒ mask fits in uint32.
-        sp.lo_mask = (uint32_t{ 1 } << sp.lo_bits) - 1;
-        // If the natural hi-limb read would land past the end of the scalar's storage,
-        // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The
-        // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤
-        // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow
-        // path's two unconditional limb loads branch-free.
-        if (static_cast<size_t>(sp.lo_limb) + 1 >= num_uint64_limbs) {
-            sp.hi_limb = sp.lo_limb;
-            sp.hi_mask = 0;
-        } else {
-            sp.hi_limb = sp.lo_limb + 1;
-            sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1;
-        }
-        // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`. hi_bits == 0
-        // captures both the in-limb case (window doesn't straddle a 64-bit boundary) and the
-        // clamped top-window case (above) where hi_mask was forced to 0.
-        sp.slice_localised_to_one_u64 = (hi_bits == 0);
-    }
-    return sp;
-}
-
-/**
- * @brief Read (window_bits+1) bits from `scalar_data` (uint64 limbs) using precomputed
- *        slice params and apply Constantine's signedWindowEncoding to produce a
- *        `(sign | bucket)` packed digit.
- *
- *        Takes the slice params as scalar value parameters rather than a struct reference
- *        so the compiler can keep them in registers across the caller loop.
- *
- *        `slice_localised_to_one_u64` selects the single-load fast path: ~75% of windows
- *        on typical 254-bit scalars (window_bits in [12, 19]) hit this.
- */
-[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data,
-                                                                                  uint32_t lo_limb,
-                                                                                  uint32_t hi_limb,
-                                                                                  uint32_t lo_off,
-                                                                                  uint32_t lo_bits,
-                                                                                  uint32_t lo_mask,
-                                                                                  uint32_t hi_mask,
-                                                                                  bool slice_localised_to_one_u64,
-                                                                                  size_t window_bits) noexcept
-{
-    uint64_t raw_wide = 0;
-    if (slice_localised_to_one_u64) {
-        // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it.
-        raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask;
-    } else if (lo_mask == 0) {
-        // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is
-        // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the
-        // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0.
-        // sp_lo_mask is loop-invariant within a window but is a runtime stack value, so the
-        // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the
-        // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window.
-        raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits;
-    } else {
-        // Slow path: window straddles a uint64 boundary.
-        const uint64_t s_lo = scalar_data[lo_limb];
-        const uint64_t s_hi = scalar_data[hi_limb];
-        const uint64_t lo_part = (s_lo >> lo_off) & lo_mask;
-        const uint64_t hi_part = (s_hi & hi_mask) << lo_bits;
-        raw_wide = lo_part | hi_part;
-    }
-    // raw fits in window_bits+1 ≤ 32 bits, safe to narrow.
-    const uint32_t raw = static_cast<uint32_t>(raw_wide);
-
-    // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit
-    // `window_bits` is the sign indicator.
-    //
-    // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard
-    // branchless idiom. We use the equivalent `(encode - neg) ^ neg_mask` to break the
-    // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both
-    // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for
-    // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path
-    // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg}
-    // in parallel → ^neg_mask → &val_mask). Identical result by:
-    //   neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓
-    //   neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓
-    const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 };
-    const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF
-    const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1;
-    const uint32_t encode = (raw + 1) >> 1;
-    const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask;
-
-    // Pack into (sign | bucket): sign in bit 31, bucket magnitude in the low bits.
-    return (neg << 31) | bucket_idx;
-}
-
-// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed
-// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86
-// (SSE2), ARM (NEON), and WASM (wasm-simd128). The branch on slice path is hoisted from
-// the per-call site to the per-window outer loop, so callers select the localised / bottom /
-// boundary specialisation once per window.
-//
-// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`:
-// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …)
-// values. `scalar.data` is a `std::array<uint64_t, 4>` whose byte layout is identical to
-// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the
-// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`,
-// etc.). The reinterpret_cast is the same alias pattern.
-//
-// Returns the four packed digits in `out[0..3]`. The caller scatters them individually,
-// since the consuming writes are not vectorisable. Switching from 2-wide uint64 to 4-wide
-// uint32 doubles the compute throughput per SIMD instruction at the cost of slightly more
-// straddle hits.
-using SimdU32x4 = uint32_t __attribute__((vector_size(16)));
-
-// Helpers return `SimdU32x4` directly so the v128 stays in the SIMD register file end-to-end.
-// Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots.
-
-// uint32-indexed Constantine slice params, mirroring `ConstantineSliceParams` but with
-// limb indices measured in 32-bit (rather than 64-bit) chunks. Computed once per window in
-// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below.
-struct ConstantineSliceParamsU32 {
-    uint32_t lo_mask;
-    uint32_t hi_mask;
-    uint32_t lo_limb; // u32 limb index of the lookback bit
-    uint32_t hi_limb; // == lo_limb + 1, clamped to last in-range u32 limb at the top window
-    uint32_t lo_off;  // bit-offset of the lookback bit within `lo_limb`
-    uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount)
-    bool slice_localised_to_one_u32;
-    bool is_bottom_window;
-};
-
-[[nodiscard]] inline ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset,
-                                                                                    size_t window_bits,
-                                                                                    size_t num_u32_limbs) noexcept
-{
-    constexpr size_t LIMB_BITS_U32 = 32;
-    ConstantineSliceParamsU32 sp;
-    if (bit_offset == 0) {
-        sp.lo_limb = 0;
-        sp.hi_limb = 0;
-        sp.lo_off = LIMB_BITS_U32 - 1;
-        sp.lo_bits = 1;
-        sp.lo_mask = 0;
-        sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1;
-        sp.slice_localised_to_one_u32 = false;
-        sp.is_bottom_window = true;
-    } else {
-        const size_t lookback_bit = bit_offset - 1;
-        const size_t bits_to_read = window_bits + 1;
-        sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS_U32);
-        sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS_U32 - 1));
-        const uint32_t in_lo = static_cast<uint32_t>(LIMB_BITS_U32 - sp.lo_off);
-        sp.lo_bits = (in_lo < static_cast<uint32_t>(bits_to_read)) ? in_lo : static_cast<uint32_t>(bits_to_read);
-        const uint32_t hi_bits = static_cast<uint32_t>(bits_to_read) - sp.lo_bits;
-        sp.lo_mask = (sp.lo_bits == LIMB_BITS_U32) ? ~uint32_t{ 0 } : ((uint32_t{ 1 } << sp.lo_bits) - 1);
-        if (static_cast<size_t>(sp.lo_limb) + 1 >= num_u32_limbs) {
-            sp.hi_limb = sp.lo_limb;
-            sp.hi_mask = 0;
-        } else {
-            sp.hi_limb = sp.lo_limb + 1;
-            sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1;
-        }
-        sp.slice_localised_to_one_u32 = (hi_bits == 0);
-        sp.is_bottom_window = false;
-    }
-    return sp;
-}
-
-// Gather 4 disjoint uint32 values into one v128 via wasm v128.load32_lane. On WASM this
-// is 1 splat + 3 load32_lane (4 ops); brace-init `{a, b, c, d}` with runtime values emits
-// 4 scalar i32.load + 1 splat + 3 replace_lane (8 ops). On native it falls back to brace-
-// init which clang lowers to NEON ins / SSE2 pinsrd.
-[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32(
-    const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept
-{
-#ifdef __wasm_simd128__
-    v128_t v = wasm_i32x4_splat(0);
-    v = wasm_v128_load32_lane(p0 + idx, v, 0);
-    v = wasm_v128_load32_lane(p1 + idx, v, 1);
-    v = wasm_v128_load32_lane(p2 + idx, v, 2);
-    v = wasm_v128_load32_lane(p3 + idx, v, 3);
-    return reinterpret_cast<SimdU32x4>(v);
-#else
-    return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] };
-#endif
-}
-
-// Store a `SimdU32x4` to a 4-lane uint32 destination as a single 128-bit op.
-// On WASM the explicit `wasm_v128_store` is used because earlier codegen for
-// the equivalent struct-wrapper assignment was observed to round-trip the
-// vector through 4 scalar memory slots; the intrinsic guarantees the
-// `i32x4.store` opcode. On native the `vector_size` store lowers directly to
-// SSE2 `movdqu` / NEON `st1`.
-[[gnu::always_inline]] inline void simd_u32x4_store(uint32_t* dst, SimdU32x4 v) noexcept
-{
-#ifdef __wasm_simd128__
-    wasm_v128_store(dst, reinterpret_cast<v128_t>(v));
-#else
-    *reinterpret_cast<SimdU32x4*>(dst) = v;
-#endif
-}
-
-// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant
-// within a window. Callers build them ONCE per window in the outer-w loop and pass them in,
-// so the inner-i compute loop has zero v128.const / splat / shl+sub for the masks.
-// `neg_mask = -neg` uses GCC vector-ext unary minus which lowers to `i32x4.neg` on WASM.
-//
-// Helpers write the v128 result directly into the caller-provided 4-lane destination buffer.
-[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst,
-                                                                                const uint32_t* scalar_data_0,
-                                                                                const uint32_t* scalar_data_1,
-                                                                                const uint32_t* scalar_data_2,
-                                                                                const uint32_t* scalar_data_3,
-                                                                                uint32_t lo_limb,
-                                                                                uint32_t lo_off,
-                                                                                SimdU32x4 lo_mask_v,
-                                                                                SimdU32x4 one_v,
-                                                                                SimdU32x4 val_mask,
-                                                                                uint32_t window_bits) noexcept
-{
-    const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
-    const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v;
-    const SimdU32x4 neg = (raw >> window_bits) & one_v;
-    const SimdU32x4 neg_mask = -neg;
-    const SimdU32x4 encode = (raw + one_v) >> 1;
-    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
-    const SimdU32x4 packed = (neg << 31) | bucket;
-    simd_u32x4_store(dst, packed);
-}
-
-[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst,
-                                                                             const uint32_t* scalar_data_0,
-                                                                             const uint32_t* scalar_data_1,
-                                                                             const uint32_t* scalar_data_2,
-                                                                             const uint32_t* scalar_data_3,
-                                                                             uint32_t hi_limb,
-                                                                             uint32_t lo_bits,
-                                                                             SimdU32x4 hi_mask_v,
-                                                                             SimdU32x4 one_v,
-                                                                             SimdU32x4 val_mask,
-                                                                             uint32_t window_bits) noexcept
-{
-    const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
-    const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits;
-    const SimdU32x4 neg = (raw >> window_bits) & one_v;
-    const SimdU32x4 neg_mask = -neg;
-    const SimdU32x4 encode = (raw + one_v) >> 1;
-    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
-    const SimdU32x4 packed = (neg << 31) | bucket;
-    simd_u32x4_store(dst, packed);
-}
-
-[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst,
-                                                                               const uint32_t* scalar_data_0,
-                                                                               const uint32_t* scalar_data_1,
-                                                                               const uint32_t* scalar_data_2,
-                                                                               const uint32_t* scalar_data_3,
-                                                                               uint32_t lo_limb,
-                                                                               uint32_t hi_limb,
-                                                                               uint32_t lo_off,
-                                                                               uint32_t lo_bits,
-                                                                               SimdU32x4 lo_mask_v,
-                                                                               SimdU32x4 hi_mask_v,
-                                                                               SimdU32x4 one_v,
-                                                                               SimdU32x4 val_mask,
-                                                                               uint32_t window_bits) noexcept
-{
-    const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
-    const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
-    const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v;
-    const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits;
-    const SimdU32x4 raw = lo_part | hi_part;
-    const SimdU32x4 neg = (raw >> window_bits) & one_v;
-    const SimdU32x4 neg_mask = -neg;
-    const SimdU32x4 encode = (raw + one_v) >> 1;
-    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
-    const SimdU32x4 packed = (neg << 31) | bucket;
-    simd_u32x4_store(dst, packed);
-}
-
-// Path-selector enum used to dispatch on the SIMD specialisation once per window rather
-// than once per scalar.
-enum class ConstantineSlicePath : uint8_t {
-    Localised = 0,
-    Bottom = 1,
-    Boundary = 2,
-};
-
-[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32(
-    const ConstantineSliceParamsU32& sp) noexcept
-{
-    if (sp.is_bottom_window) {
-        return ConstantineSlicePath::Bottom;
-    }
-    if (sp.slice_localised_to_one_u32) {
-        return ConstantineSlicePath::Localised;
-    }
-    return ConstantineSlicePath::Boundary;
-}
-
-} // namespace bb::scalar_multiplication::round_parallel_detail
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp
deleted file mode 100644
index 199166832bc7..000000000000
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-// Unit tests for the Constantine signed-Booth window recoder used by the
-// round-parallel Pippenger MSM. Validates the scalar packed-digit recoder,
-// the SIMD x4 specialisations (Localised / Bottom / Boundary), and the
-// round-trip identity `Σ_w (-1)^sign_w · bucket_w · 2^{B_w} ≡ scalar`.
-
-#include "pippenger_constantine.hpp"
-
-#include "barretenberg/ecc/curves/bn254/fr.hpp"
-#include "barretenberg/numeric/random/engine.hpp"
-#include "barretenberg/numeric/uint256/uint256.hpp"
-
-#include <array>
-#include <cstdint>
-#include <gtest/gtest.h>
-#include <vector>
-
-namespace {
-
-namespace cnst = bb::scalar_multiplication::round_parallel_detail;
-using ScalarField = bb::fr;
-auto& engine = bb::numeric::get_randomness();
-
-constexpr size_t LIMB_BITS_U64 = 64;
-constexpr size_t NUM_LIMBS_U64 = 4;
-constexpr size_t NUM_LIMBS_U32 = 8;
-constexpr size_t MAX_BITS = 256;
-
-// =============================================================================
-// Reference signed-window encoder. Reads `(window_bits + 1)` bits from the
-// scalar starting at `bit_offset - 1` (with a synthetic 0 at bit -1 when
-// bit_offset == 0), then applies the signed-Booth encode:
-//
-//   raw  = bits [bit_offset-1, bit_offset + window_bits)
-//   neg  = raw >> window_bits           (top bit = sign indicator)
-//   encode = (raw + 1) >> 1             (drop the lookback bit)
-//   bucket = (encode - neg) ^ (-neg)    (conditional negate, branchless)
-//   packed = (neg << 31) | bucket
-//
-// Same algebra as `get_constantine_packed_digit`, but implemented in the most
-// obvious way against a flat `bit_at(i)` accessor so any error in the
-// production path's limb-walking or branchless conditional negate will diverge.
-// =============================================================================
-uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
-{
-    auto bit_at = [&](int64_t i) -> uint64_t {
-        if (i < 0 || static_cast<size_t>(i) >= MAX_BITS) {
-            return 0;
-        }
-        return (scalar_data[static_cast<size_t>(i) / LIMB_BITS_U64] >> (static_cast<size_t>(i) % LIMB_BITS_U64)) &
-               uint64_t{ 1 };
-    };
-    uint32_t raw = 0;
-    for (size_t k = 0; k <= window_bits; ++k) {
-        const int64_t bit_idx = static_cast<int64_t>(bit_offset) + static_cast<int64_t>(k) - 1;
-        raw |= static_cast<uint32_t>(bit_at(bit_idx)) << k;
-    }
-    const uint32_t neg = (raw >> window_bits) & 1U;
-    const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1;
-    const uint32_t encode = (raw + 1) >> 1;
-    const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask;
-    return (neg << 31) | bucket;
-}
-
-// Random non-Montgomery scalar — uniform over [0, modulus). We invoke the
-// recoder against the raw limbs so the random_element form is irrelevant; what
-// matters is that the limb bytes are arbitrary.
-std::array<uint64_t, NUM_LIMBS_U64> random_scalar_limbs()
-{
-    std::array<uint64_t, NUM_LIMBS_U64> out{};
-    for (size_t i = 0; i < NUM_LIMBS_U64; ++i) {
-        out[i] = engine.get_random_uint64();
-    }
-    return out;
-}
-
-// View the same scalar as a uint32 limb array (little-endian: x86/ARM/WASM all
-// agree). The SIMD x4 helpers index by uint32 limbs.
-const uint32_t* as_u32(const std::array<uint64_t, NUM_LIMBS_U64>& s)
-{
-    return reinterpret_cast<const uint32_t*>(s.data());
-}
-
-// Drive `get_constantine_packed_digit` via the params returned by
-// `compute_constantine_slice_params`. The hot loop in Stage 1 / Stage 4 unpacks
-// the struct into scalar values; we mirror that call shape exactly so a future
-// API change here would be caught.
-uint32_t production_scalar_path(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
-{
-    const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64);
-    return cnst::get_constantine_packed_digit(scalar_data,
-                                              sp.lo_limb,
-                                              sp.hi_limb,
-                                              sp.lo_off,
-                                              sp.lo_bits,
-                                              sp.lo_mask,
-                                              sp.hi_mask,
-                                              sp.slice_localised_to_one_u64,
-                                              window_bits);
-}
-
-// Drive the 4-wide SIMD specialisations by classifying the slice path and
-// calling the matching `store_constantine_packed_digits_x4_*` helper. Out[i]
-// is the packed digit for the i-th scalar. Mirrors Stage 1's per-window
-// dispatch loop in `scalar_multiplication.cpp`.
-void production_simd_path(const std::array<uint64_t, NUM_LIMBS_U64> scalars[4],
-                          size_t bit_offset,
-                          size_t window_bits,
-                          uint32_t out[4])
-{
-    const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
-    const cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask };
-    const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask };
-    const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 };
-    const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1;
-    const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar };
-
-    const uint32_t* s0 = as_u32(scalars[0]);
-    const uint32_t* s1 = as_u32(scalars[1]);
-    const uint32_t* s2 = as_u32(scalars[2]);
-    const uint32_t* s3 = as_u32(scalars[3]);
-
-    const uint32_t wb_u32 = static_cast<uint32_t>(window_bits);
-    switch (cnst::classify_slice_path_u32(sp)) {
-    case cnst::ConstantineSlicePath::Localised:
-        cnst::store_constantine_packed_digits_x4_localised(
-            out, s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32);
-        break;
-    case cnst::ConstantineSlicePath::Bottom:
-        cnst::store_constantine_packed_digits_x4_bottom(
-            out, s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32);
-        break;
-    case cnst::ConstantineSlicePath::Boundary:
-        cnst::store_constantine_packed_digits_x4_boundary(out,
-                                                          s0,
-                                                          s1,
-                                                          s2,
-                                                          s3,
-                                                          sp.lo_limb,
-                                                          sp.hi_limb,
-                                                          sp.lo_off,
-                                                          sp.lo_bits,
-                                                          lo_mask_v,
-                                                          hi_mask_v,
-                                                          one_v,
-                                                          val_mask,
-                                                          wb_u32);
-        break;
-    }
-}
-
-} // namespace
-
-// =============================================================================
-// Test 1 — Scalar packed-digit recoder matches the textbook reference oracle
-// across all `(window_bits, bit_offset)` pairs the live pipeline ever issues.
-// =============================================================================
-TEST(PippengerConstantine, ScalarMatchesReferenceOracleAllWindowBits)
-{
-    constexpr size_t TRIALS_PER_SHAPE = 32;
-    // window_bits range covers production: choose_window_bits returns 2..19,
-    // build_var_window_schedule's final window can additionally be 1 bit wide
-    // (e.g. wb=3 over 256 bits yields 85*3 + 1). bit_offset 255 covers the
-    // above-modulus top edge where every read bit is structurally zero.
-    for (size_t window_bits = 1; window_bits <= 19; ++window_bits) {
-        for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) {
-            for (size_t t = 0; t < TRIALS_PER_SHAPE; ++t) {
-                const auto s = random_scalar_limbs();
-                const uint32_t got = production_scalar_path(s.data(), bit_offset, window_bits);
-                const uint32_t want = reference_packed_digit(s.data(), bit_offset, window_bits);
-                ASSERT_EQ(got, want) << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " trial=" << t;
-            }
-        }
-    }
-}
-
-// =============================================================================
-// Test 2 — SIMD x4 path agrees with the scalar path lane-by-lane across all
-// three specialisations (Localised / Bottom / Boundary). Each bit_offset
-// implicitly selects which specialisation runs; we sweep every offset so all
-// three are exercised.
-// =============================================================================
-TEST(PippengerConstantine, SimdX4MatchesScalarPathLanewise)
-{
-    constexpr size_t TRIALS_PER_SHAPE = 16;
-    bool saw_localised = false;
-    bool saw_bottom = false;
-    bool saw_boundary = false;
-    // window_bits range covers production: choose_window_bits returns 2..19,
-    // build_var_window_schedule's final window can additionally be 1 bit wide
-    // (e.g. wb=3 over 256 bits yields 85*3 + 1). bit_offset 255 covers the
-    // above-modulus top edge where every read bit is structurally zero.
-    for (size_t window_bits = 1; window_bits <= 19; ++window_bits) {
-        for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) {
-            const auto sp_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
-            switch (cnst::classify_slice_path_u32(sp_u32)) {
-            case cnst::ConstantineSlicePath::Localised:
-                saw_localised = true;
-                break;
-            case cnst::ConstantineSlicePath::Bottom:
-                saw_bottom = true;
-                break;
-            case cnst::ConstantineSlicePath::Boundary:
-                saw_boundary = true;
-                break;
-            }
-            for (size_t t = 0; t < TRIALS_PER_SHAPE; ++t) {
-                std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> scalars{
-                    random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs()
-                };
-                std::array<uint32_t, 4> got_simd{};
-                production_simd_path(scalars.data(), bit_offset, window_bits, got_simd.data());
-                for (size_t lane = 0; lane < 4; ++lane) {
-                    const uint32_t want = production_scalar_path(scalars[lane].data(), bit_offset, window_bits);
-                    ASSERT_EQ(got_simd[lane], want)
-                        << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " lane=" << lane;
-                }
-            }
-        }
-    }
-    // The sweep must exercise all three specialisations or the SIMD coverage is
-    // a no-op for a path. (Coverage check, not a behavioural claim.)
-    EXPECT_TRUE(saw_localised);
-    EXPECT_TRUE(saw_bottom);
-    EXPECT_TRUE(saw_boundary);
-}
-
-// =============================================================================
-// Test 3 — Round-trip identity. For any tiled window schedule covering
-// [0, total_bits) bits, the sum `Σ_w (-1)^sign_w · bucket_w · 2^{B_w}` must
-// equal the scalar value modulo 2^total_bits. This is the load-bearing
-// algebraic invariant the whole MSM rests on; if it ever fails the rest of
-// the pipeline silently mis-computes the result.
-// =============================================================================
-TEST(PippengerConstantine, RoundTripIdentityMatchesScalarMod2N)
-{
-    constexpr size_t TOTAL_BITS = 254;
-    constexpr size_t TRIALS = 64;
-    // Including window_bits == 1 because `build_var_window_schedule` truncates
-    // the final window to whatever bits remain, which can be exactly 1.
-    for (size_t window_bits = 1; window_bits <= 19; ++window_bits) {
-        for (size_t t = 0; t < TRIALS; ++t) {
-            const auto s = random_scalar_limbs();
-            // Recover scalar value as a 256-bit big integer (4 × uint64).
-            // We reconstruct it limb-by-limb using __int128 arithmetic so the
-            // round-trip is plainly readable; production code uses field
-            // arithmetic, which we deliberately avoid here.
-            //
-            // Tile windows of width `window_bits` until we cover TOTAL_BITS+2
-            // bits. The +2 mirrors the `total_bits = num_bits + 2` budget used
-            // by `build_var_window_schedule` to absorb the carry-less top bit.
-            std::vector<std::pair<int32_t, size_t>> signed_digits; // (signed_value, bit_offset)
-            size_t bit_offset = 0;
-            size_t bits_remaining = TOTAL_BITS + 2;
-            while (bits_remaining > 0) {
-                const size_t wb = std::min(window_bits, bits_remaining);
-                const uint32_t packed = production_scalar_path(s.data(), bit_offset, wb);
-                const uint32_t neg = packed >> 31;
-                const uint32_t bucket = packed & ((uint32_t{ 1 } << wb) - 1);
-                const int32_t signed_val = (neg != 0U) ? -static_cast<int32_t>(bucket) : static_cast<int32_t>(bucket);
-                signed_digits.emplace_back(signed_val, bit_offset);
-                bit_offset += wb;
-                bits_remaining -= wb;
-            }
-
-            // Reconstruct: Σ_w signed_val_w · 2^{bit_offset_w} mod 2^256, using
-            // uint256_t arithmetic where signed subtraction is just `acc -= |v| << off`.
-            bb::numeric::uint256_t acc(0);
-            for (const auto& [v, off] : signed_digits) {
-                const bb::numeric::uint256_t shifted = bb::numeric::uint256_t(static_cast<uint64_t>(v < 0 ? -v : v))
-                                                       << bb::numeric::uint256_t(off);
-                if (v < 0) {
-                    acc -= shifted;
-                } else {
-                    acc += shifted;
-                }
-            }
-            const bb::numeric::uint256_t scalar_val(s[0], s[1], s[2], s[3]);
-            EXPECT_EQ(acc, scalar_val) << "window_bits=" << window_bits << " trial=" << t;
-        }
-    }
-}
-
-// =============================================================================
-// Test 4 — Edge cases. Pin the structural boundaries explicitly so a regression
-// at one of them (rather than at a random bit) shows up as a named failure.
-// =============================================================================
-TEST(PippengerConstantine, EdgeCases)
-{
-    // (a) Zero scalar — every packed digit must be 0 (sign 0, bucket 0).
-    // Sweep includes wb=1 (final-window truncation) and bit_offset=255
-    // (above-modulus top edge — every bit read is structurally zero).
-    std::array<uint64_t, NUM_LIMBS_U64> zero{};
-    for (size_t wb = 1; wb <= 19; ++wb) {
-        for (size_t off = 0; off <= 255; ++off) {
-            EXPECT_EQ(production_scalar_path(zero.data(), off, wb), uint32_t{ 0 })
-                << "zero scalar wb=" << wb << " off=" << off;
-        }
-    }
-
-    // (b) Bottom window — bit_offset == 0 must select the synthetic-zero
-    // lookback path. The classifier flags it via `is_bottom_window`.
-    const auto sp_bottom = cnst::compute_constantine_slice_params_u32(0, 12, NUM_LIMBS_U32);
-    EXPECT_TRUE(sp_bottom.is_bottom_window);
-    EXPECT_EQ(cnst::classify_slice_path_u32(sp_bottom), cnst::ConstantineSlicePath::Bottom);
-
-    // (c) Top window — when the natural hi_limb read lands past the scalar's
-    // storage, the production code clamps `hi_limb` and zeros `hi_mask`. The
-    // packed digit must still match the reference oracle (which extends with
-    // zeros above bit 256). Sweep all the way to bit_offset=255 to cover the
-    // above-modulus case where every read bit is structurally zero.
-    auto top_aligned = random_scalar_limbs();
-    constexpr size_t window_bits = 12;
-    for (size_t bit_offset = 240; bit_offset <= 255; ++bit_offset) {
-        const uint32_t got = production_scalar_path(top_aligned.data(), bit_offset, window_bits);
-        const uint32_t want = reference_packed_digit(top_aligned.data(), bit_offset, window_bits);
-        EXPECT_EQ(got, want) << "top window bit_offset=" << bit_offset;
-    }
-
-    // (d) Localised fast path — the c+1-bit window must fit inside a single
-    // uint64 limb for the localised path to be selected. With window_bits=12
-    // and bit_offset=10, the lookback bit is at limb 0, bit 9; the window
-    // spans bits 10..21 — all inside limb 0, so localised path fires.
-    const auto sp_local = cnst::compute_constantine_slice_params(10, 12, NUM_LIMBS_U64);
-    EXPECT_TRUE(sp_local.slice_localised_to_one_u64);
-
-    // (e) Boundary case — when the window straddles a uint64 boundary the
-    // localised flag must be false. With window_bits=12 and bit_offset=60,
-    // the window spans bits 59..71 → crosses bit 63→64.
-    const auto sp_boundary = cnst::compute_constantine_slice_params(60, 12, NUM_LIMBS_U64);
-    EXPECT_FALSE(sp_boundary.slice_localised_to_one_u64);
-}
-
-// =============================================================================
-// Test 5 — Named slice-shape table. Random sweeps probably hit every
-// (limb_index, slice_path) combination, but a regression at one of these
-// boundaries (e.g. "boundary across bit 31→32, lookback in lo half") shows up
-// as a named failure here rather than an opaque "trial 17 of 32" log line.
-//
-// `bit_offset` here is the absolute bit position of the FIRST window bit; the
-// lookback bit lives at `bit_offset - 1`. Each row pins the (bit_offset, wb)
-// pair, the expected slice path under u32 indexing, and the expected
-// localisation under u64 indexing.
-// =============================================================================
-TEST(PippengerConstantine, NamedSliceShapes)
-{
-    struct ShapeCase {
-        const char* name;
-        size_t bit_offset;
-        size_t window_bits;
-        cnst::ConstantineSlicePath u32_path;
-        bool u64_localised; // expected `slice_localised_to_one_u64`
-    };
-    // Picked so each row exercises a structurally distinct shape:
-    //   - bottom_*    : synthetic-lookback path
-    //   - local_*     : c+1 bits fit inside a single u64 limb (and matching u32)
-    //   - boundary_*  : window straddles a u64 or u32 limb boundary
-    //   - top_clamped : hi_limb would land past scalar storage → clamp + zero mask
-    const std::array<ShapeCase, 12> cases{ {
-        // Bottom — bit_offset 0 across several wb.
-        { "bottom_wb12", 0, 12, cnst::ConstantineSlicePath::Bottom, false },
-        { "bottom_wb2", 0, 2, cnst::ConstantineSlicePath::Bottom, false },
-        { "bottom_wb19", 0, 19, cnst::ConstantineSlicePath::Bottom, false },
-        // Localised — lookback + window inside a single u32 (and therefore a single u64).
-        { "local_lo_u32", 10, 12, cnst::ConstantineSlicePath::Localised, true },
-        // Localised in u64 but boundary in u32 — lookback at bit 30 (u32 limb 0), window spans bits 30..42
-        // (crosses u32 bit 31→32) but stays inside u64 limb 0.
-        { "local_u64_boundary_u32", 31, 12, cnst::ConstantineSlicePath::Boundary, true },
-        // Boundary across u64 bit 63→64.
-        { "boundary_u64_at_63", 60, 12, cnst::ConstantineSlicePath::Boundary, false },
-        { "boundary_u64_at_127", 124, 12, cnst::ConstantineSlicePath::Boundary, false },
-        { "boundary_u64_at_191", 188, 12, cnst::ConstantineSlicePath::Boundary, false },
-        // Boundary at u32 bit 31→32 with lookback in low half.
-        { "boundary_u32_at_31", 30, 4, cnst::ConstantineSlicePath::Boundary, true },
-        // Top window — clamp regime. With wb=12, bit_offset=246 reads bits 245..257; hi limb is past
-        // the scalar's 256-bit storage in u32 view (limb_index 7 is the last).
-        { "top_clamped_wb12", 246, 12, cnst::ConstantineSlicePath::Boundary, false },
-        // wb=1 at the very top — the final-window case `build_var_window_schedule` can emit.
-        { "top_wb1_final", 254, 1, cnst::ConstantineSlicePath::Localised, true },
-        // Random mid-scalar localised case as a "happy path" anchor.
-        { "local_mid_u64", 80, 12, cnst::ConstantineSlicePath::Localised, true },
-    } };
-
-    auto s = random_scalar_limbs();
-    for (const auto& c : cases) {
-        const auto sp_u32 = cnst::compute_constantine_slice_params_u32(c.bit_offset, c.window_bits, NUM_LIMBS_U32);
-        const auto sp_u64 = cnst::compute_constantine_slice_params(c.bit_offset, c.window_bits, NUM_LIMBS_U64);
-        EXPECT_EQ(cnst::classify_slice_path_u32(sp_u32), c.u32_path) << "case=" << c.name;
-        EXPECT_EQ(sp_u64.slice_localised_to_one_u64, c.u64_localised) << "case=" << c.name;
-
-        // The encoder must still produce the reference value at each named shape.
-        const uint32_t got = production_scalar_path(s.data(), c.bit_offset, c.window_bits);
-        const uint32_t want = reference_packed_digit(s.data(), c.bit_offset, c.window_bits);
-        EXPECT_EQ(got, want) << "case=" << c.name;
-    }
-}
-
-// =============================================================================
-// Test 6 — u64 / u32 param classifier internal consistency.
-//
-// The scalar path uses `ConstantineSliceParams` (u64-indexed); the SIMD path
-// uses `ConstantineSliceParamsU32` (u32-indexed). Comparing final packed
-// digits (Test 1+2) catches END-to-END divergence, but a compensating bug
-// across the two param computations could mask itself. This test asserts the
-// param structs encode the SAME lookback bit position and the SAME read width
-// where their definitions agree, so a bug in one classifier alone shows up
-// even if the digits happen to round-trip.
-// =============================================================================
-TEST(PippengerConstantine, ParamClassifierU64U32Consistency)
-{
-    for (size_t wb = 1; wb <= 19; ++wb) {
-        for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) {
-            const auto sp_u64 = cnst::compute_constantine_slice_params(bit_offset, wb, NUM_LIMBS_U64);
-            const auto sp_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, wb, NUM_LIMBS_U32);
-
-            // Bottom-window classification: both must agree (u64 signals via lo_mask==0,
-            // u32 via the explicit is_bottom_window flag).
-            const bool u64_says_bottom = (sp_u64.lo_mask == 0);
-            EXPECT_EQ(u64_says_bottom, sp_u32.is_bottom_window)
-                << "bottom classification disagrees at bit_offset=" << bit_offset << " wb=" << wb;
-
-            // Lookback bit absolute position: lo_limb·LIMB_BITS + lo_off. Both views must
-            // identify the same absolute bit (skip bottom, where the lookback is synthetic
-            // and the limb/offset encoding is intentionally not a real position).
-            if (!sp_u32.is_bottom_window) {
-                const size_t u64_lookback = sp_u64.lo_limb * 64 + sp_u64.lo_off;
-                const size_t u32_lookback = sp_u32.lo_limb * 32 + sp_u32.lo_off;
-                EXPECT_EQ(u64_lookback, u32_lookback)
-                    << "lookback bit disagrees at bit_offset=" << bit_offset << " wb=" << wb;
-                EXPECT_EQ(u64_lookback, bit_offset - 1)
-                    << "lookback bit ≠ bit_offset-1 at bit_offset=" << bit_offset << " wb=" << wb;
-            }
-
-            // Localised-flag implication: u64-localised means the whole c+1 window lives in
-            // one u64 limb. That does NOT imply u32-localised (window could still straddle
-            // a u32 boundary inside the same u64), but it DOES imply the u32 view's slice
-            // path is NOT Bottom (bit_offset > 0 cases only).
-            if (sp_u64.slice_localised_to_one_u64 && bit_offset > 0) {
-                EXPECT_NE(cnst::classify_slice_path_u32(sp_u32), cnst::ConstantineSlicePath::Bottom)
-                    << "u64-localised but u32 classifier says Bottom at bit_offset=" << bit_offset << " wb=" << wb;
-            }
-        }
-    }
-}