From d76a3163082f0400e0616d9e91ae77b9f76aaa63 Mon Sep 17 00:00:00 2001 From: iakovenkos Date: Tue, 26 May 2026 13:48:50 +0000 Subject: [PATCH 1/2] feat: extract Constantine signed-Booth window recoder with tests + fuzzer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds pippenger_constantine.hpp as a standalone primitive carved out of the upcoming round-parallel Pippenger MSM rewrite. Self-contained header in bb::scalar_multiplication::round_parallel_detail with no callers on this branch — landed first so the recoder gets reviewed and CI-gated as a mathematical primitive rather than buried inside the larger MSM PR. The recoder implements Constantine's signedWindowEncoding / getSignedFullWindowAt (constantine/math/arithmetic/bigints.nim): each window reads (c+1) bits including the previous window's top bit and lets that shared boundary bit substitute for an explicit carry, producing a (sign | bucket) packed digit. Two parallel families: * Scalar path — ConstantineSliceParams + get_constantine_packed_digit (uint64-indexed limbs) * SIMD x4 path — ConstantineSliceParamsU32 + three specialised stores (uint32-indexed limbs, GCC vector_size, 4 scalars/call) The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window branch hoists out of the per-scalar inner loop. Tests (pippenger_constantine.test.cpp, ~48 ms native / ~46 ms WASM): 1. ScalarMatchesReferenceOracleAllWindowBits — production scalar path vs a textbook signed-window reference oracle. Sweeps window_bits in [1, 19] (covers the [2, 19] choose_window_bits range plus the wb=1 final window that build_var_window_schedule can emit) and bit_offset in [0, 255]. 2. SimdX4MatchesScalarPathLanewise — SIMD x4 ≡ scalar lane-by-lane across all three specialisations, with a coverage assertion that each fires. 3. RoundTripIdentityMatchesScalarMod2N — sum_w (-1)^{sign_w} · bucket_w · 2^{B_w} ≡ scalar (mod 2^256). The load-bearing algebraic invariant. 4. 
EdgeCases — zero scalar, bottom-window classifier, top-window limb clamp (incl. bit_offset 255), localised/boundary flag boundaries. 5. NamedSliceShapes — 12 named structural cases (bottom_wb*, local_*, boundary_u64_at_{63,127,191}, boundary_u32_at_31, top_clamped, top_wb1_final) so a regression at one shape shows up as a named failure. 6. ParamClassifierU64U32Consistency — asserts the u64 / u32 param structs encode the same lookback bit position and bottom-window classification, so a bug in one classifier alone is not masked by the final-digit oracle. Fuzzer (pippenger_constantine.fuzzer.cpp): Differential libFuzzer target — for each (window_bits, bit_offset, 4 random scalars) checks (a) scalar path == reference oracle, (b) SIMD x4 == scalar per lane. Verified clean under both `fuzzing` (30M iters / 30s) and `fuzzing-asan` (30M iters / 46s) presets with a boundary-biased seed corpus pinned to {0, 1, 31/32/33, 63/64/65, 127/128, 191/192, 253/254/255}. Minor follow-on cleanup: extracted simd_u32x4_store() helper to dedupe the three `#ifdef __wasm_simd128__` blocks across the SIMD specialisations. [[gnu::always_inline]] makes it post-inline byte-identical to the previous open-coded form. 
--- .../pippenger_constantine.fuzzer.cpp | 160 +++++++ .../pippenger_constantine.hpp | 389 +++++++++++++++ .../pippenger_constantine.test.cpp | 443 ++++++++++++++++++ 3 files changed, 992 insertions(+) create mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp create mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp create mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp new file mode 100644 index 000000000000..451a9baa31b9 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp @@ -0,0 +1,160 @@ +// libFuzzer target for the Constantine signed-Booth window recoder. +// +// Two-pronged differential check on each input: +// 1. Scalar path vs textbook reference oracle — catches encoder algebra bugs. +// 2. SIMD x4 path vs scalar path (lane-by-lane) — catches lane-mux / mask / +// vector-shift bugs in the three slice-path specialisations. +// +// Input layout: 1 byte window_bits ∈ [1, 19], 1 byte bit_offset ∈ [0, 255], +// followed by 32 bytes × 4 = 128 bytes of scalar limb material. Total minimum +// input = 130 bytes; smaller inputs are zero-padded so libFuzzer's empty-seed +// kickoff still drives the encoder. 
+// +// Run: +// cmake --preset fuzzing && cmake --build --preset fuzzing --target ecc_pippenger_constantine_fuzzer +// ./build-fuzzing/bin/ecc_pippenger_constantine_fuzzer -max_total_time=60 + +#include "pippenger_constantine.hpp" + +#include "barretenberg/numeric/uint256/uint256.hpp" + +#include +#include +#include + +namespace { + +namespace cnst = bb::scalar_multiplication::round_parallel_detail; + +constexpr size_t LIMB_BITS_U64 = 64; +constexpr size_t NUM_LIMBS_U64 = 4; +constexpr size_t NUM_LIMBS_U32 = 8; +constexpr size_t MAX_BITS = 256; +constexpr size_t SCALAR_BYTES = 32; + +uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) +{ + auto bit_at = [&](int64_t i) -> uint64_t { + if (i < 0 || static_cast(i) >= MAX_BITS) { + return 0; + } + return (scalar_data[static_cast(i) / LIMB_BITS_U64] >> (static_cast(i) % LIMB_BITS_U64)) & + uint64_t{ 1 }; + }; + uint32_t raw = 0; + for (size_t k = 0; k <= window_bits; ++k) { + const int64_t bit_idx = static_cast(bit_offset) + static_cast(k) - 1; + raw |= static_cast(bit_at(bit_idx)) << k; + } + const uint32_t neg = (raw >> window_bits) & 1U; + const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; + const uint32_t encode = (raw + 1) >> 1; + const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask; + return (neg << 31) | bucket; +} + +uint32_t production_scalar(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) +{ + const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64); + return cnst::get_constantine_packed_digit(scalar_data, + sp.lo_limb, + sp.hi_limb, + sp.lo_off, + sp.lo_bits, + sp.lo_mask, + sp.hi_mask, + sp.slice_localised_to_one_u64, + window_bits); +} + +void production_simd(const std::array, 4>& scalars, + size_t bit_offset, + size_t window_bits, + std::array& out) +{ + const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32); + const 
cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask }; + const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask }; + const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 }; + const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1; + const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar }; + const auto* s0 = reinterpret_cast(scalars[0].data()); + const auto* s1 = reinterpret_cast(scalars[1].data()); + const auto* s2 = reinterpret_cast(scalars[2].data()); + const auto* s3 = reinterpret_cast(scalars[3].data()); + const auto wb_u32 = static_cast(window_bits); + + switch (cnst::classify_slice_path_u32(sp)) { + case cnst::ConstantineSlicePath::Localised: + cnst::store_constantine_packed_digits_x4_localised( + out.data(), s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32); + break; + case cnst::ConstantineSlicePath::Bottom: + cnst::store_constantine_packed_digits_x4_bottom( + out.data(), s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32); + break; + case cnst::ConstantineSlicePath::Boundary: + cnst::store_constantine_packed_digits_x4_boundary(out.data(), + s0, + s1, + s2, + s3, + sp.lo_limb, + sp.hi_limb, + sp.lo_off, + sp.lo_bits, + lo_mask_v, + hi_mask_v, + one_v, + val_mask, + wb_u32); + break; + } +} + +} // namespace + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) +{ + // Pad input to the minimum required length so empty / tiny seeds still + // exercise the encoder against zero-extended scalars. + constexpr size_t MIN_INPUT = 2 + (SCALAR_BYTES * 4); + std::array buf{}; + std::memcpy(buf.data(), data, std::min(size, MIN_INPUT)); + + // window_bits ∈ [1, 19] — `choose_window_bits` returns [2,19]; the final + // window emitted by `build_var_window_schedule` can additionally be 1 bit + // (e.g. wb=3 over 256 bits = 85*3+1). Outside this range the encoder has + // no well-defined behavior in production. 
+ const size_t window_bits = 1 + (buf[0] % 19); + // bit_offset ∈ [0, 255] — the live pipeline's range, including the top + // edge where bit_offset+wb extends past the scalar's 256 bits (production + // code clamps `hi_limb` and zeros `hi_mask`). + const size_t bit_offset = buf[1] & 0xff; + + std::array, 4> scalars{}; + for (size_t lane = 0; lane < 4; ++lane) { + std::memcpy(scalars[lane].data(), buf.data() + 2 + (lane * SCALAR_BYTES), SCALAR_BYTES); + } + + // Check 1: scalar path matches the textbook reference oracle. + for (size_t lane = 0; lane < 4; ++lane) { + const uint32_t got = production_scalar(scalars[lane].data(), bit_offset, window_bits); + const uint32_t want = reference_packed_digit(scalars[lane].data(), bit_offset, window_bits); + if (got != want) { + __builtin_trap(); + } + } + + // Check 2: SIMD x4 path agrees with scalar path lane-by-lane. + std::array simd_out{}; + production_simd(scalars, bit_offset, window_bits, simd_out); + for (size_t lane = 0; lane < 4; ++lane) { + const uint32_t want = production_scalar(scalars[lane].data(), bit_offset, window_bits); + if (simd_out[lane] != want) { + __builtin_trap(); + } + } + + return 0; +} diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp new file mode 100644 index 000000000000..ec2c3e6800b2 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp @@ -0,0 +1,389 @@ +// Constantine-style signed-Booth window recoder for Pippenger MSM. +// +// Given a scalar s = sum_i s_i 2^i and a window [b, b + c), this module computes a +// signed digit d in [-(2^c - 1), 2^c - 1] such that the scalar can be reconstructed as +// s = sum_w d_w 2^{b_w}. It returns d as a packed `(sign | bucket)` value, where +// `bucket = |d|` and `sign` records whether d is negative. 
+// +// Implements the carry-less `signedWindowEncoding` / `getSignedFullWindowAt` pattern from +// `constantine/math/arithmetic/bigints.nim`: each window reads c+1 bits including the +// previous window boundary bit, lets that shared boundary bit substitute for an explicit +// carry, and produces a `(sign | bucket)` packed digit. +// +// Assumptions: production callers pass `window_bits` in [1, 19] and bit offsets within a +// 256-bit scalar. The bit-twiddling below assumes `window_bits < 32`. +// +// Two parallel paths: +// * scalar path — `ConstantineSliceParams` + `get_constantine_packed_digit` (uint64- +// indexed limbs). +// * SIMD x4 path — `ConstantineSliceParamsU32` + `store_constantine_packed_digits_x4_*` +// (uint32-indexed limbs, processes 4 scalars per call via GCC vector_size). +// +// The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window +// branch is hoisted out of the per-scalar loop. `classify_slice_path_u32` returns the +// matching enum for callers to dispatch on once per window. + +#pragma once + +#include +#include + +#ifdef __wasm_simd128__ +#include +#endif + +namespace bb::scalar_multiplication::round_parallel_detail { + +/** + * @brief Per-window precomputed slice parameters for the carry-less signed-Booth window + * recoding (after Constantine `signedWindowEncoding` / `getSignedFullWindowAt`, + * `constantine/math/arithmetic/bigints.nim`). Computed once per window by the + * caller; the per-scalar hot path is then fixed bit-twiddling with no per-iteration + * slice address arithmetic. + * Carry-less because every non-bottom window's c+1-bit read shares its boundary bit + * with the previous window — the bit a non-overlapping recoder would carry. + * + * `slice_localised_to_one_u64`: true iff every bit of the c+1-bit window lives inside a + * single uint64 limb. 
Most windows on typical 254-bit scalars with c in [12, 19] + * (lookback bits at non-boundary positions) hit this and take the fast path: one load, + * one shift, one mask. The slow path is the boundary-straddling case + the synthetic- + * lookback bottom window. + */ +struct ConstantineSliceParams { + uint32_t lo_mask; + uint32_t hi_mask; + uint32_t lo_limb; + uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window + uint32_t lo_off; + uint32_t lo_bits; + bool slice_localised_to_one_u64; +}; + +/** + * @brief Compute the Constantine slice params for a window starting at absolute bit position + * `bit_offset` (= Σ_{k(lookback_bit / LIMB_BITS); + sp.lo_off = static_cast(lookback_bit & (LIMB_BITS - 1)); + sp.lo_bits = static_cast(LIMB_BITS - sp.lo_off < bits_to_read ? LIMB_BITS - sp.lo_off : bits_to_read); + const uint32_t hi_bits = static_cast(bits_to_read) - sp.lo_bits; + // window_bits+1 ≤ 32 for our windows ⇒ lo_bits ≤ 32 ⇒ mask fits in uint32. + sp.lo_mask = (uint32_t{ 1 } << sp.lo_bits) - 1; + // If the natural hi-limb read would land past the end of the scalar's storage, + // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The + // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤ + // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow + // path's two unconditional limb loads branch-free. + if (static_cast(sp.lo_limb) + 1 >= num_uint64_limbs) { + sp.hi_limb = sp.lo_limb; + sp.hi_mask = 0; + } else { + sp.hi_limb = sp.lo_limb + 1; + sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1; + } + // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`, i.e. + // hi_bits == 0. A clamped top window with hi_bits > 0 is not localised: it takes + // the two-load path, where hi_mask == 0 cancels the re-read of lo_limb. 
+ sp.slice_localised_to_one_u64 = (hi_bits == 0); + } + return sp; +} + +/** + * @brief Read (window_bits+1) bits from `scalar_data` (uint64 limbs) using precomputed + * slice params and apply Constantine's signedWindowEncoding to produce a + * `(sign | bucket)` packed digit. + * + * Takes the slice params as scalar value parameters rather than a struct reference + * so the compiler can keep them in registers across the caller loop. + * + * `slice_localised_to_one_u64` selects the single-load fast path: ~75% of windows + * on typical 254-bit scalars (window_bits in [12, 19]) hit this. + */ +[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data, + uint32_t lo_limb, + uint32_t hi_limb, + uint32_t lo_off, + uint32_t lo_bits, + uint32_t lo_mask, + uint32_t hi_mask, + bool slice_localised_to_one_u64, + size_t window_bits) noexcept +{ + uint64_t raw_wide = 0; + if (slice_localised_to_one_u64) { + // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it. + raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask; + } else if (lo_mask == 0) { + // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is + // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the + // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0. + // sp_lo_mask is loop-invariant within a window but is a runtime stack value, so the + // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the + // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window. + raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits; + } else { + // Slow path: window straddles a uint64 boundary. 
+ const uint64_t s_lo = scalar_data[lo_limb]; + const uint64_t s_hi = scalar_data[hi_limb]; + const uint64_t lo_part = (s_lo >> lo_off) & lo_mask; + const uint64_t hi_part = (s_hi & hi_mask) << lo_bits; + raw_wide = lo_part | hi_part; + } + // raw fits in window_bits+1 ≤ 32 bits, safe to narrow. + const uint32_t raw = static_cast(raw_wide); + + // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit + // `window_bits` is the sign indicator. + // + // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard + // branchless idiom. We use the equivalent `(encode - neg) ^ neg_mask` to break the + // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both + // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for + // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path + // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg} + // in parallel → ^neg_mask → &val_mask). Identical result by: + // neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓ + // neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓ + const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 }; + const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF + const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; + const uint32_t encode = (raw + 1) >> 1; + const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask; + + // Pack into (sign | bucket): sign in bit 31, bucket magnitude in the low bits. + return (neg << 31) | bucket_idx; +} + +// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed +// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86 +// (SSE2), ARM (NEON), and WASM (wasm-simd128). 
The branch on slice path is hoisted from +// the per-call site to the per-window outer loop, so callers select the localised / bottom / +// boundary specialisation once per window. +// +// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`: +// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …) +// values. `scalar.data` is a `std::array` whose byte layout is identical to +// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the +// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`, +// etc.). The reinterpret_cast is the same alias pattern. +// +// Returns the four packed digits in `out[0..3]`. The caller scatters them individually, +// since the consuming writes are not vectorisable. Switching from 2-wide uint64 to 4-wide +// uint32 doubles the compute throughput per SIMD instruction at the cost of slightly more +// straddle hits. +using SimdU32x4 = uint32_t __attribute__((vector_size(16))); + +// Helpers return `SimdU32x4` directly so the v128 stays in the SIMD register file end-to-end. +// Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots. + +// uint32-indexed Constantine slice params, mirroring `ConstantineSliceParams` but with +// limb indices measured in 32-bit (rather than 64-bit) chunks. Computed once per window in +// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below. 
+struct ConstantineSliceParamsU32 { + uint32_t lo_mask; + uint32_t hi_mask; + uint32_t lo_limb; // u32 limb index of the lookback bit + uint32_t hi_limb; // == lo_limb + 1, clamped to last in-range u32 limb at the top window + uint32_t lo_off; // bit-offset of the lookback bit within `lo_limb` + uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount) + bool slice_localised_to_one_u32; + bool is_bottom_window; +}; + +[[nodiscard]] inline ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset, + size_t window_bits, + size_t num_u32_limbs) noexcept +{ + constexpr size_t LIMB_BITS_U32 = 32; + ConstantineSliceParamsU32 sp; + if (bit_offset == 0) { + sp.lo_limb = 0; + sp.hi_limb = 0; + sp.lo_off = LIMB_BITS_U32 - 1; + sp.lo_bits = 1; + sp.lo_mask = 0; + sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1; + sp.slice_localised_to_one_u32 = false; + sp.is_bottom_window = true; + } else { + const size_t lookback_bit = bit_offset - 1; + const size_t bits_to_read = window_bits + 1; + sp.lo_limb = static_cast(lookback_bit / LIMB_BITS_U32); + sp.lo_off = static_cast(lookback_bit & (LIMB_BITS_U32 - 1)); + const uint32_t in_lo = static_cast(LIMB_BITS_U32 - sp.lo_off); + sp.lo_bits = (in_lo < static_cast(bits_to_read)) ? in_lo : static_cast(bits_to_read); + const uint32_t hi_bits = static_cast(bits_to_read) - sp.lo_bits; + sp.lo_mask = (sp.lo_bits == LIMB_BITS_U32) ? ~uint32_t{ 0 } : ((uint32_t{ 1 } << sp.lo_bits) - 1); + if (static_cast(sp.lo_limb) + 1 >= num_u32_limbs) { + sp.hi_limb = sp.lo_limb; + sp.hi_mask = 0; + } else { + sp.hi_limb = sp.lo_limb + 1; + sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1; + } + sp.slice_localised_to_one_u32 = (hi_bits == 0); + sp.is_bottom_window = false; + } + return sp; +} + +// Gather 4 disjoint uint32 values into one v128 via wasm v128.load32_lane. 
On WASM this +// is 1 splat + 3 load32_lane (4 ops); brace-init `{a, b, c, d}` with runtime values emits +// 4 scalar i32.load + 1 splat + 3 replace_lane (8 ops). On native it falls back to brace- +// init which clang lowers to NEON ins / SSE2 pinsrd. +[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32( + const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept +{ +#ifdef __wasm_simd128__ + v128_t v = wasm_i32x4_splat(0); + v = wasm_v128_load32_lane(p0 + idx, v, 0); + v = wasm_v128_load32_lane(p1 + idx, v, 1); + v = wasm_v128_load32_lane(p2 + idx, v, 2); + v = wasm_v128_load32_lane(p3 + idx, v, 3); + return reinterpret_cast(v); +#else + return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] }; +#endif +} + +// Store a `SimdU32x4` to a 4-lane uint32 destination as a single 128-bit op. +// On WASM the explicit `wasm_v128_store` is used because earlier codegen for +// the equivalent struct-wrapper assignment was observed to round-trip the +// vector through 4 scalar memory slots; the intrinsic guarantees the +// `i32x4.store` opcode. On native the `vector_size` store lowers directly to +// SSE2 `movdqu` / NEON `st1`. +[[gnu::always_inline]] inline void simd_u32x4_store(uint32_t* dst, SimdU32x4 v) noexcept +{ +#ifdef __wasm_simd128__ + wasm_v128_store(dst, reinterpret_cast(v)); +#else + *reinterpret_cast(dst) = v; +#endif +} + +// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant +// within a window. Callers build them ONCE per window in the outer-w loop and pass them in, +// so the inner-i compute loop has zero v128.const / splat / shl+sub for the masks. +// `neg_mask = -neg` uses GCC vector-ext unary minus which lowers to `i32x4.neg` on WASM. +// +// Helpers write the v128 result directly into the caller-provided 4-lane destination buffer. 
+[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst, + const uint32_t* scalar_data_0, + const uint32_t* scalar_data_1, + const uint32_t* scalar_data_2, + const uint32_t* scalar_data_3, + uint32_t lo_limb, + uint32_t lo_off, + SimdU32x4 lo_mask_v, + SimdU32x4 one_v, + SimdU32x4 val_mask, + uint32_t window_bits) noexcept +{ + const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb); + const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v; + const SimdU32x4 neg = (raw >> window_bits) & one_v; + const SimdU32x4 neg_mask = -neg; + const SimdU32x4 encode = (raw + one_v) >> 1; + const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; + const SimdU32x4 packed = (neg << 31) | bucket; + simd_u32x4_store(dst, packed); +} + +[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst, + const uint32_t* scalar_data_0, + const uint32_t* scalar_data_1, + const uint32_t* scalar_data_2, + const uint32_t* scalar_data_3, + uint32_t hi_limb, + uint32_t lo_bits, + SimdU32x4 hi_mask_v, + SimdU32x4 one_v, + SimdU32x4 val_mask, + uint32_t window_bits) noexcept +{ + const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb); + const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits; + const SimdU32x4 neg = (raw >> window_bits) & one_v; + const SimdU32x4 neg_mask = -neg; + const SimdU32x4 encode = (raw + one_v) >> 1; + const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; + const SimdU32x4 packed = (neg << 31) | bucket; + simd_u32x4_store(dst, packed); +} + +[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst, + const uint32_t* scalar_data_0, + const uint32_t* scalar_data_1, + const uint32_t* scalar_data_2, + const uint32_t* scalar_data_3, + uint32_t lo_limb, + uint32_t hi_limb, + uint32_t lo_off, + uint32_t lo_bits, + SimdU32x4 lo_mask_v, + SimdU32x4 hi_mask_v, + SimdU32x4 
one_v, + SimdU32x4 val_mask, + uint32_t window_bits) noexcept +{ + const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb); + const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb); + const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v; + const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits; + const SimdU32x4 raw = lo_part | hi_part; + const SimdU32x4 neg = (raw >> window_bits) & one_v; + const SimdU32x4 neg_mask = -neg; + const SimdU32x4 encode = (raw + one_v) >> 1; + const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; + const SimdU32x4 packed = (neg << 31) | bucket; + simd_u32x4_store(dst, packed); +} + +// Path-selector enum used to dispatch on the SIMD specialisation once per window rather +// than once per scalar. +enum class ConstantineSlicePath : uint8_t { + Localised = 0, + Bottom = 1, + Boundary = 2, +}; + +[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32( + const ConstantineSliceParamsU32& sp) noexcept +{ + if (sp.is_bottom_window) { + return ConstantineSlicePath::Bottom; + } + if (sp.slice_localised_to_one_u32) { + return ConstantineSlicePath::Localised; + } + return ConstantineSlicePath::Boundary; +} + +} // namespace bb::scalar_multiplication::round_parallel_detail diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp new file mode 100644 index 000000000000..199166832bc7 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp @@ -0,0 +1,443 @@ +// Unit tests for the Constantine signed-Booth window recoder used by the +// round-parallel Pippenger MSM. 
Validates the scalar packed-digit recoder, +// the SIMD x4 specialisations (Localised / Bottom / Boundary), and the +// round-trip identity `Σ_w (-1)^sign_w · bucket_w · 2^{B_w} ≡ scalar`. + +#include "pippenger_constantine.hpp" + +#include "barretenberg/ecc/curves/bn254/fr.hpp" +#include "barretenberg/numeric/random/engine.hpp" +#include "barretenberg/numeric/uint256/uint256.hpp" + +#include +#include +#include +#include + +namespace { + +namespace cnst = bb::scalar_multiplication::round_parallel_detail; +using ScalarField = bb::fr; +auto& engine = bb::numeric::get_randomness(); + +constexpr size_t LIMB_BITS_U64 = 64; +constexpr size_t NUM_LIMBS_U64 = 4; +constexpr size_t NUM_LIMBS_U32 = 8; +constexpr size_t MAX_BITS = 256; + +// ============================================================================= +// Reference signed-window encoder. Reads `(window_bits + 1)` bits from the +// scalar starting at `bit_offset - 1` (with a synthetic 0 at bit -1 when +// bit_offset == 0), then applies the signed-Booth encode: +// +// raw = bits [bit_offset-1, bit_offset + window_bits) +// neg = raw >> window_bits (top bit = sign indicator) +// encode = (raw + 1) >> 1 (drop the lookback bit) +// bucket = (encode - neg) ^ (-neg) (conditional negate, branchless) +// packed = (neg << 31) | bucket +// +// Same algebra as `get_constantine_packed_digit`, but implemented in the most +// obvious way against a flat `bit_at(i)` accessor so any error in the +// production path's limb-walking or branchless conditional negate will diverge. 
+// ============================================================================= +uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) +{ + auto bit_at = [&](int64_t i) -> uint64_t { + if (i < 0 || static_cast(i) >= MAX_BITS) { + return 0; + } + return (scalar_data[static_cast(i) / LIMB_BITS_U64] >> (static_cast(i) % LIMB_BITS_U64)) & + uint64_t{ 1 }; + }; + uint32_t raw = 0; + for (size_t k = 0; k <= window_bits; ++k) { + const int64_t bit_idx = static_cast(bit_offset) + static_cast(k) - 1; + raw |= static_cast(bit_at(bit_idx)) << k; + } + const uint32_t neg = (raw >> window_bits) & 1U; + const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; + const uint32_t encode = (raw + 1) >> 1; + const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask; + return (neg << 31) | bucket; +} + +// Random raw limbs — each uint64 limb is drawn independently from the engine, +// so values span all 256-bit patterns and are not reduced below the modulus. +// The recoder runs on raw limbs, so only arbitrary limb bytes matter here. +std::array random_scalar_limbs() +{ + std::array out{}; + for (size_t i = 0; i < NUM_LIMBS_U64; ++i) { + out[i] = engine.get_random_uint64(); + } + return out; +} + +// View the same scalar as a uint32 limb array (little-endian: x86/ARM/WASM all +// agree). The SIMD x4 helpers index by uint32 limbs. +const uint32_t* as_u32(const std::array& s) +{ + return reinterpret_cast(s.data()); +} + +// Drive `get_constantine_packed_digit` via the params returned by +// `compute_constantine_slice_params`. The hot loop in Stage 1 / Stage 4 unpacks +// the struct into scalar values; we mirror that call shape exactly so a future +// API change here would be caught. 
+uint32_t production_scalar_path(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) +{ + const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64); + return cnst::get_constantine_packed_digit(scalar_data, + sp.lo_limb, + sp.hi_limb, + sp.lo_off, + sp.lo_bits, + sp.lo_mask, + sp.hi_mask, + sp.slice_localised_to_one_u64, + window_bits); +} + +// Drive the 4-wide SIMD specialisations by classifying the slice path and +// calling the matching `store_constantine_packed_digits_x4_*` helper. Out[i] +// is the packed digit for the i-th scalar. Mirrors Stage 1's per-window +// dispatch loop in `scalar_multiplication.cpp`. +void production_simd_path(const std::array scalars[4], + size_t bit_offset, + size_t window_bits, + uint32_t out[4]) +{ + const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32); + const cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask }; + const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask }; + const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 }; + const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1; + const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar }; + + const uint32_t* s0 = as_u32(scalars[0]); + const uint32_t* s1 = as_u32(scalars[1]); + const uint32_t* s2 = as_u32(scalars[2]); + const uint32_t* s3 = as_u32(scalars[3]); + + const uint32_t wb_u32 = static_cast(window_bits); + switch (cnst::classify_slice_path_u32(sp)) { + case cnst::ConstantineSlicePath::Localised: + cnst::store_constantine_packed_digits_x4_localised( + out, s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32); + break; + case cnst::ConstantineSlicePath::Bottom: + cnst::store_constantine_packed_digits_x4_bottom( + out, s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32); + break; + case cnst::ConstantineSlicePath::Boundary: + 
cnst::store_constantine_packed_digits_x4_boundary(out, + s0, + s1, + s2, + s3, + sp.lo_limb, + sp.hi_limb, + sp.lo_off, + sp.lo_bits, + lo_mask_v, + hi_mask_v, + one_v, + val_mask, + wb_u32); + break; + } +} + +} // namespace + +// ============================================================================= +// Test 1 — Scalar packed-digit recoder matches the textbook reference oracle +// across all `(window_bits, bit_offset)` pairs the live pipeline ever issues. +// ============================================================================= +TEST(PippengerConstantine, ScalarMatchesReferenceOracleAllWindowBits) +{ + constexpr size_t TRIALS_PER_SHAPE = 32; + // window_bits range covers production: choose_window_bits returns 2..19, + // build_var_window_schedule's final window can additionally be 1 bit wide + // (e.g. wb=3 over 256 bits yields 85*3 + 1). bit_offset 255 is the top edge: bits + // past the 256-bit limb storage must read as zero; bits 254/255 stay random here. + for (size_t window_bits = 1; window_bits <= 19; ++window_bits) { + for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) { + for (size_t t = 0; t < TRIALS_PER_SHAPE; ++t) { + const auto s = random_scalar_limbs(); + const uint32_t got = production_scalar_path(s.data(), bit_offset, window_bits); + const uint32_t want = reference_packed_digit(s.data(), bit_offset, window_bits); + ASSERT_EQ(got, want) << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " trial=" << t; + } + } + } +} + +// ============================================================================= +// Test 2 — SIMD x4 path agrees with the scalar path lane-by-lane across all +// three specialisations (Localised / Bottom / Boundary). Each bit_offset +// implicitly selects which specialisation runs; we sweep every offset so all +// three are exercised.
+// ============================================================================= +TEST(PippengerConstantine, SimdX4MatchesScalarPathLanewise) +{ + constexpr size_t TRIALS_PER_SHAPE = 16; + bool saw_localised = false; + bool saw_bottom = false; + bool saw_boundary = false; + // window_bits range covers production: choose_window_bits returns 2..19, + // build_var_window_schedule's final window can additionally be 1 bit wide + // (e.g. wb=3 over 256 bits yields 85*3 + 1). bit_offset 255 is the top edge: bits + // past the 256-bit limb storage must read as zero; bits 254/255 stay random here. + for (size_t window_bits = 1; window_bits <= 19; ++window_bits) { + for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) { + const auto sp_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32); + switch (cnst::classify_slice_path_u32(sp_u32)) { + case cnst::ConstantineSlicePath::Localised: + saw_localised = true; + break; + case cnst::ConstantineSlicePath::Bottom: + saw_bottom = true; + break; + case cnst::ConstantineSlicePath::Boundary: + saw_boundary = true; + break; + } + for (size_t t = 0; t < TRIALS_PER_SHAPE; ++t) { + std::array, 4> scalars{ + random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs() + }; + std::array got_simd{}; + production_simd_path(scalars.data(), bit_offset, window_bits, got_simd.data()); + for (size_t lane = 0; lane < 4; ++lane) { + const uint32_t want = production_scalar_path(scalars[lane].data(), bit_offset, window_bits); + ASSERT_EQ(got_simd[lane], want) + << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " lane=" << lane; + } + } + } + } + // The sweep must exercise all three specialisations or the SIMD coverage is + // a no-op for a path. (Coverage check, not a behavioural claim.)
+ EXPECT_TRUE(saw_localised); + EXPECT_TRUE(saw_bottom); + EXPECT_TRUE(saw_boundary); +} + +// ============================================================================= +// Test 3 — Round-trip identity. For any tiled window schedule covering +// [0, total_bits) bits, the sum `Σ_w (-1)^sign_w · bucket_w · 2^{B_w}` must +// equal the scalar value modulo 2^total_bits. This is the load-bearing +// algebraic invariant the whole MSM rests on; if it ever fails the rest of +// the pipeline silently mis-computes the result. +// ============================================================================= +TEST(PippengerConstantine, RoundTripIdentityMatchesScalarMod2N) +{ + constexpr size_t TOTAL_BITS = 254; + constexpr size_t TRIALS = 64; + // Including window_bits == 1 because `build_var_window_schedule` truncates + // the final window to whatever bits remain, which can be exactly 1. + for (size_t window_bits = 1; window_bits <= 19; ++window_bits) { + for (size_t t = 0; t < TRIALS; ++t) { + const auto s = random_scalar_limbs(); + // Recover scalar value as a 256-bit big integer (4 × uint64). + // The signed digits are re-summed below in wrapping uint256_t arithmetic + // so the round-trip is plainly readable; production code uses field + // arithmetic, which we deliberately avoid here. + // + // Tile windows of width `window_bits` until we cover TOTAL_BITS+2 + // bits. The +2 mirrors the `total_bits = num_bits + 2` budget used + // by `build_var_window_schedule` to absorb the carry-less top bit. + std::vector> signed_digits; // (signed_value, bit_offset) + size_t bit_offset = 0; + size_t bits_remaining = TOTAL_BITS + 2; + while (bits_remaining > 0) { + const size_t wb = std::min(window_bits, bits_remaining); + const uint32_t packed = production_scalar_path(s.data(), bit_offset, wb); + const uint32_t neg = packed >> 31; + const uint32_t bucket = packed & ((uint32_t{ 1 } << wb) - 1); + const int32_t signed_val = (neg != 0U) ?
-static_cast(bucket) : static_cast(bucket); + signed_digits.emplace_back(signed_val, bit_offset); + bit_offset += wb; + bits_remaining -= wb; + } + + // Reconstruct: Σ_w signed_val_w · 2^{bit_offset_w} mod 2^256, using + // uint256_t arithmetic where signed subtraction is just `acc -= |v| << off`. + bb::numeric::uint256_t acc(0); + for (const auto& [v, off] : signed_digits) { + const bb::numeric::uint256_t shifted = bb::numeric::uint256_t(static_cast(v < 0 ? -v : v)) + << bb::numeric::uint256_t(off); + if (v < 0) { + acc -= shifted; + } else { + acc += shifted; + } + } + const bb::numeric::uint256_t scalar_val(s[0], s[1], s[2], s[3]); + EXPECT_EQ(acc, scalar_val) << "window_bits=" << window_bits << " trial=" << t; + } + } +} + +// ============================================================================= +// Test 4 — Edge cases. Pin the structural boundaries explicitly so a regression +// at one of them (rather than at a random bit) shows up as a named failure. +// ============================================================================= +TEST(PippengerConstantine, EdgeCases) +{ + // (a) Zero scalar — every packed digit must be 0 (sign 0, bucket 0). + // Sweep includes wb=1 (final-window truncation) and bit_offset=255 + // (above-modulus top edge — every bit read is structurally zero). + std::array zero{}; + for (size_t wb = 1; wb <= 19; ++wb) { + for (size_t off = 0; off <= 255; ++off) { + EXPECT_EQ(production_scalar_path(zero.data(), off, wb), uint32_t{ 0 }) + << "zero scalar wb=" << wb << " off=" << off; + } + } + + // (b) Bottom window — bit_offset == 0 must select the synthetic-zero + // lookback path. The classifier flags it via `is_bottom_window`. 
+ const auto sp_bottom = cnst::compute_constantine_slice_params_u32(0, 12, NUM_LIMBS_U32); + EXPECT_TRUE(sp_bottom.is_bottom_window); + EXPECT_EQ(cnst::classify_slice_path_u32(sp_bottom), cnst::ConstantineSlicePath::Bottom); + + // (c) Top window — when the natural hi_limb read lands past the scalar's + // storage, the production code clamps `hi_limb` and zeros `hi_mask`. The + // packed digit must still match the reference oracle (which reads every + // bit index >= 256 as zero). Sweep up to bit_offset=255, where all but the + // two lowest bits read (254 and 255) lie past the end of the limb storage. + auto top_aligned = random_scalar_limbs(); + constexpr size_t window_bits = 12; + for (size_t bit_offset = 240; bit_offset <= 255; ++bit_offset) { + const uint32_t got = production_scalar_path(top_aligned.data(), bit_offset, window_bits); + const uint32_t want = reference_packed_digit(top_aligned.data(), bit_offset, window_bits); + EXPECT_EQ(got, want) << "top window bit_offset=" << bit_offset; + } + + // (d) Localised fast path — the c+1-bit window must fit inside a single + // uint64 limb for the localised path to be selected. With window_bits=12 + // and bit_offset=10, the lookback bit is at limb 0, bit 9; the window + // spans bits 10..21 — all inside limb 0, so localised path fires. + const auto sp_local = cnst::compute_constantine_slice_params(10, 12, NUM_LIMBS_U64); + EXPECT_TRUE(sp_local.slice_localised_to_one_u64); + + // (e) Boundary case — when the window straddles a uint64 boundary the + // localised flag must be false. With window_bits=12 and bit_offset=60, + // the window spans bits 59..71 → crosses bit 63→64. + const auto sp_boundary = cnst::compute_constantine_slice_params(60, 12, NUM_LIMBS_U64); + EXPECT_FALSE(sp_boundary.slice_localised_to_one_u64); +} + +// ============================================================================= +// Test 5 — Named slice-shape table.
Random sweeps probably hit every +// (limb_index, slice_path) combination, but a regression at one of these +// boundaries (e.g. "boundary across bit 31→32, lookback in lo half") shows up +// as a named failure here rather than an opaque "trial 17 of 32" log line. +// +// `bit_offset` here is the absolute bit position of the FIRST window bit; the +// lookback bit lives at `bit_offset - 1`. Each row pins the (bit_offset, wb) +// pair, the expected slice path under u32 indexing, and the expected +// localisation under u64 indexing. +// ============================================================================= +TEST(PippengerConstantine, NamedSliceShapes) +{ + struct ShapeCase { + const char* name; + size_t bit_offset; + size_t window_bits; + cnst::ConstantineSlicePath u32_path; + bool u64_localised; // expected `slice_localised_to_one_u64` + }; + // Picked so each row exercises a structurally distinct shape: + // - bottom_* : synthetic-lookback path + // - local_* : c+1 bits fit inside a single u64 limb (and matching u32) + // - boundary_* : window straddles a u64 or u32 limb boundary + // - top_clamped : hi_limb would land past scalar storage → clamp + zero mask + const std::array cases{ { + // Bottom — bit_offset 0 across several wb. + { "bottom_wb12", 0, 12, cnst::ConstantineSlicePath::Bottom, false }, + { "bottom_wb2", 0, 2, cnst::ConstantineSlicePath::Bottom, false }, + { "bottom_wb19", 0, 19, cnst::ConstantineSlicePath::Bottom, false }, + // Localised — lookback + window inside a single u32 (and therefore a single u64). + { "local_lo_u32", 10, 12, cnst::ConstantineSlicePath::Localised, true }, + // Localised in u64 but boundary in u32 — lookback at bit 30 (u32 limb 0), window spans bits 30..42 + // (crosses u32 bit 31→32) but stays inside u64 limb 0. + { "local_u64_boundary_u32", 31, 12, cnst::ConstantineSlicePath::Boundary, true }, + // Boundary across u64 bit 63→64. 
+ { "boundary_u64_at_63", 60, 12, cnst::ConstantineSlicePath::Boundary, false }, + { "boundary_u64_at_127", 124, 12, cnst::ConstantineSlicePath::Boundary, false }, + { "boundary_u64_at_191", 188, 12, cnst::ConstantineSlicePath::Boundary, false }, + // Boundary at u32 bit 31→32 with lookback in low half. + { "boundary_u32_at_31", 30, 4, cnst::ConstantineSlicePath::Boundary, true }, + // Top window — clamp regime. With wb=12, bit_offset=246 reads bits 245..257; hi limb is past + // the scalar's 256-bit storage in u32 view (limb_index 7 is the last). + { "top_clamped_wb12", 246, 12, cnst::ConstantineSlicePath::Boundary, false }, + // wb=1 at the very top — the final-window case `build_var_window_schedule` can emit. + { "top_wb1_final", 254, 1, cnst::ConstantineSlicePath::Localised, true }, + // Random mid-scalar localised case as a "happy path" anchor. + { "local_mid_u64", 80, 12, cnst::ConstantineSlicePath::Localised, true }, + } }; + + auto s = random_scalar_limbs(); + for (const auto& c : cases) { + const auto sp_u32 = cnst::compute_constantine_slice_params_u32(c.bit_offset, c.window_bits, NUM_LIMBS_U32); + const auto sp_u64 = cnst::compute_constantine_slice_params(c.bit_offset, c.window_bits, NUM_LIMBS_U64); + EXPECT_EQ(cnst::classify_slice_path_u32(sp_u32), c.u32_path) << "case=" << c.name; + EXPECT_EQ(sp_u64.slice_localised_to_one_u64, c.u64_localised) << "case=" << c.name; + + // The encoder must still produce the reference value at each named shape. + const uint32_t got = production_scalar_path(s.data(), c.bit_offset, c.window_bits); + const uint32_t want = reference_packed_digit(s.data(), c.bit_offset, c.window_bits); + EXPECT_EQ(got, want) << "case=" << c.name; + } +} + +// ============================================================================= +// Test 6 — u64 / u32 param classifier internal consistency. +// +// The scalar path uses `ConstantineSliceParams` (u64-indexed); the SIMD path +// uses `ConstantineSliceParamsU32` (u32-indexed). 
Comparing final packed +// digits (Test 1+2) catches END-to-END divergence, but a compensating bug +// across the two param computations could mask itself. This test asserts the +// param structs encode the SAME lookback bit position and the SAME bottom- +// window classification (read widths are not compared), so a bug in one +// classifier alone shows up even if the digits happen to round-trip. +// ============================================================================= +TEST(PippengerConstantine, ParamClassifierU64U32Consistency) +{ + for (size_t wb = 1; wb <= 19; ++wb) { + for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) { + const auto sp_u64 = cnst::compute_constantine_slice_params(bit_offset, wb, NUM_LIMBS_U64); + const auto sp_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, wb, NUM_LIMBS_U32); + + // Bottom-window classification: both must agree (u64 signals via lo_mask==0, + // u32 via the explicit is_bottom_window flag). + const bool u64_says_bottom = (sp_u64.lo_mask == 0); + EXPECT_EQ(u64_says_bottom, sp_u32.is_bottom_window) + << "bottom classification disagrees at bit_offset=" << bit_offset << " wb=" << wb; + + // Lookback bit absolute position: lo_limb·LIMB_BITS + lo_off. Both views must + // identify the same absolute bit (skip bottom, where the lookback is synthetic + // and the limb/offset encoding is intentionally not a real position). + if (!sp_u32.is_bottom_window) { + const size_t u64_lookback = sp_u64.lo_limb * 64 + sp_u64.lo_off; + const size_t u32_lookback = sp_u32.lo_limb * 32 + sp_u32.lo_off; + EXPECT_EQ(u64_lookback, u32_lookback) + << "lookback bit disagrees at bit_offset=" << bit_offset << " wb=" << wb; + EXPECT_EQ(u64_lookback, bit_offset - 1) + << "lookback bit ≠ bit_offset-1 at bit_offset=" << bit_offset << " wb=" << wb; + } + + // Localised-flag implication: u64-localised means the whole c+1 window lives in + // one u64 limb.
That does NOT imply u32-localised (window could still straddle + // a u32 boundary inside the same u64), but it DOES imply the u32 view's slice + // path is NOT Bottom (bit_offset > 0 cases only). + if (sp_u64.slice_localised_to_one_u64 && bit_offset > 0) { + EXPECT_NE(cnst::classify_slice_path_u32(sp_u32), cnst::ConstantineSlicePath::Bottom) + << "u64-localised but u32 classifier says Bottom at bit_offset=" << bit_offset << " wb=" << wb; + } + } + } +} From 43a1a1b7bc31bf5f891a15d400903d6ee05ec24f Mon Sep 17 00:00:00 2001 From: sergei iakovenko <105737703+iakovenkos@users.noreply.github.com> Date: Tue, 26 May 2026 16:31:20 +0200 Subject: [PATCH 2/2] fix: revert extract Constantine signed-Booth window recoder (#23561) Reverts d76a3163082 which was pushed directly to merge-train/barretenberg without a PR. Will re-land via proper PR from `si/pippenger-constantine`. --- .../pippenger_constantine.fuzzer.cpp | 160 ------- .../pippenger_constantine.hpp | 389 --------------- .../pippenger_constantine.test.cpp | 443 ------------------ 3 files changed, 992 deletions(-) delete mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp delete mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp delete mode 100644 barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp deleted file mode 100644 index 451a9baa31b9..000000000000 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// libFuzzer target for the Constantine signed-Booth window recoder. -// -// Two-pronged differential check on each input: -// 1. Scalar path vs textbook reference oracle — catches encoder algebra bugs. 
-// 2. SIMD x4 path vs scalar path (lane-by-lane) — catches lane-mux / mask / -// vector-shift bugs in the three slice-path specialisations. -// -// Input layout: 1 byte window_bits ∈ [2, 18], 1 byte bit_offset ∈ [0, 254], -// followed by 32 bytes × 4 = 128 bytes of scalar limb material. Total minimum -// input = 130 bytes; smaller inputs are zero-padded so libFuzzer's empty-seed -// kickoff still drives the encoder. -// -// Run: -// cmake --preset fuzzing && cmake --build --preset fuzzing --target ecc_pippenger_constantine_fuzzer -// ./build-fuzzing/bin/ecc_pippenger_constantine_fuzzer -max_total_time=60 - -#include "pippenger_constantine.hpp" - -#include "barretenberg/numeric/uint256/uint256.hpp" - -#include -#include -#include - -namespace { - -namespace cnst = bb::scalar_multiplication::round_parallel_detail; - -constexpr size_t LIMB_BITS_U64 = 64; -constexpr size_t NUM_LIMBS_U64 = 4; -constexpr size_t NUM_LIMBS_U32 = 8; -constexpr size_t MAX_BITS = 256; -constexpr size_t SCALAR_BYTES = 32; - -uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) -{ - auto bit_at = [&](int64_t i) -> uint64_t { - if (i < 0 || static_cast(i) >= MAX_BITS) { - return 0; - } - return (scalar_data[static_cast(i) / LIMB_BITS_U64] >> (static_cast(i) % LIMB_BITS_U64)) & - uint64_t{ 1 }; - }; - uint32_t raw = 0; - for (size_t k = 0; k <= window_bits; ++k) { - const int64_t bit_idx = static_cast(bit_offset) + static_cast(k) - 1; - raw |= static_cast(bit_at(bit_idx)) << k; - } - const uint32_t neg = (raw >> window_bits) & 1U; - const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; - const uint32_t encode = (raw + 1) >> 1; - const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask; - return (neg << 31) | bucket; -} - -uint32_t production_scalar(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) -{ - const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64); - return 
cnst::get_constantine_packed_digit(scalar_data, - sp.lo_limb, - sp.hi_limb, - sp.lo_off, - sp.lo_bits, - sp.lo_mask, - sp.hi_mask, - sp.slice_localised_to_one_u64, - window_bits); -} - -void production_simd(const std::array, 4>& scalars, - size_t bit_offset, - size_t window_bits, - std::array& out) -{ - const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32); - const cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask }; - const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask }; - const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 }; - const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1; - const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar }; - const auto* s0 = reinterpret_cast(scalars[0].data()); - const auto* s1 = reinterpret_cast(scalars[1].data()); - const auto* s2 = reinterpret_cast(scalars[2].data()); - const auto* s3 = reinterpret_cast(scalars[3].data()); - const auto wb_u32 = static_cast(window_bits); - - switch (cnst::classify_slice_path_u32(sp)) { - case cnst::ConstantineSlicePath::Localised: - cnst::store_constantine_packed_digits_x4_localised( - out.data(), s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32); - break; - case cnst::ConstantineSlicePath::Bottom: - cnst::store_constantine_packed_digits_x4_bottom( - out.data(), s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32); - break; - case cnst::ConstantineSlicePath::Boundary: - cnst::store_constantine_packed_digits_x4_boundary(out.data(), - s0, - s1, - s2, - s3, - sp.lo_limb, - sp.hi_limb, - sp.lo_off, - sp.lo_bits, - lo_mask_v, - hi_mask_v, - one_v, - val_mask, - wb_u32); - break; - } -} - -} // namespace - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) -{ - // Pad input to the minimum required length so empty / tiny seeds still - // exercise the encoder against zero-extended scalars. 
- constexpr size_t MIN_INPUT = 2 + (SCALAR_BYTES * 4); - std::array buf{}; - std::memcpy(buf.data(), data, std::min(size, MIN_INPUT)); - - // window_bits ∈ [1, 19] — `choose_window_bits` returns [2,19]; the final - // window emitted by `build_var_window_schedule` can additionally be 1 bit - // (e.g. wb=3 over 256 bits = 85*3+1). Outside this range the encoder has - // no well-defined behavior in production. - const size_t window_bits = 1 + (buf[0] % 19); - // bit_offset ∈ [0, 255] — the live pipeline's range, including the top - // edge where bit_offset+wb extends past the scalar's 256 bits (production - // code clamps `hi_limb` and zeros `hi_mask`). - const size_t bit_offset = buf[1] & 0xff; - - std::array, 4> scalars{}; - for (size_t lane = 0; lane < 4; ++lane) { - std::memcpy(scalars[lane].data(), buf.data() + 2 + (lane * SCALAR_BYTES), SCALAR_BYTES); - } - - // Check 1: scalar path matches the textbook reference oracle. - for (size_t lane = 0; lane < 4; ++lane) { - const uint32_t got = production_scalar(scalars[lane].data(), bit_offset, window_bits); - const uint32_t want = reference_packed_digit(scalars[lane].data(), bit_offset, window_bits); - if (got != want) { - __builtin_trap(); - } - } - - // Check 2: SIMD x4 path agrees with scalar path lane-by-lane. 
- std::array simd_out{}; - production_simd(scalars, bit_offset, window_bits, simd_out); - for (size_t lane = 0; lane < 4; ++lane) { - const uint32_t want = production_scalar(scalars[lane].data(), bit_offset, window_bits); - if (simd_out[lane] != want) { - __builtin_trap(); - } - } - - return 0; -} diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp deleted file mode 100644 index ec2c3e6800b2..000000000000 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp +++ /dev/null @@ -1,389 +0,0 @@ -// Constantine-style signed-Booth window recoder for Pippenger MSM. -// -// Given a scalar s = sum_i s_i 2^i and a window [b, b + c), this module computes a -// signed digit d in [-(2^c - 1), 2^c - 1] such that the scalar can be reconstructed as -// s = sum_w d_w 2^{b_w}. It returns d as a packed `(sign | bucket)` value, where -// `bucket = |d|` and `sign` records whether d is negative. -// -// Implements the carry-less `signedWindowEncoding` / `getSignedFullWindowAt` pattern from -// `constantine/math/arithmetic/bigints.nim`: each window reads c+1 bits including the -// previous window boundary bit, lets that shared boundary bit substitute for an explicit -// carry, and produces a `(sign | bucket)` packed digit. -// -// Assumptions: production callers pass `window_bits` in [1, 19] and bit offsets within a -// 256-bit scalar. The bit-twiddling below assumes `window_bits < 32`. -// -// Two parallel paths: -// * scalar path — `ConstantineSliceParams` + `get_constantine_packed_digit` (uint64- -// indexed limbs). -// * SIMD x4 path — `ConstantineSliceParamsU32` + `store_constantine_packed_digits_x4_*` -// (uint32-indexed limbs, processes 4 scalars per call via GCC vector_size). 
-// -// The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window -// branch is hoisted out of the per-scalar loop. `classify_slice_path_u32` returns the -// matching enum for callers to dispatch on once per window. - -#pragma once - -#include -#include - -#ifdef __wasm_simd128__ -#include -#endif - -namespace bb::scalar_multiplication::round_parallel_detail { - -/** - * @brief Per-window precomputed slice parameters for the carry-less signed-Booth window - * recoding (after Constantine `signedWindowEncoding` / `getSignedFullWindowAt`, - * `constantine/math/arithmetic/bigints.nim`). Computed once per window by the - * caller; the per-scalar hot path is then fixed bit-twiddling with no per-iteration - * slice address arithmetic. - * Carry-less because every non-bottom window's c+1-bit read shares its boundary bit - * with the previous window — the bit a non-overlapping recoder would carry. - * - * `slice_localised_to_one_u64`: true iff every bit of the c+1-bit window lives inside a - * single uint64 limb. Most windows on typical 254-bit scalars with c in [12, 19] - * (lookback bits at non-boundary positions) hit this and take the fast path: one load, - * one shift, one mask. The slow path is the boundary-straddling case + the synthetic- - * lookback bottom window. - */ -struct ConstantineSliceParams { - uint32_t lo_mask; - uint32_t hi_mask; - uint32_t lo_limb; - uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window - uint32_t lo_off; - uint32_t lo_bits; - bool slice_localised_to_one_u64; -}; - -/** - * @brief Compute the Constantine slice params for a window starting at absolute bit position - * `bit_offset` (= Σ_{k(lookback_bit / LIMB_BITS); - sp.lo_off = static_cast(lookback_bit & (LIMB_BITS - 1)); - sp.lo_bits = static_cast(LIMB_BITS - sp.lo_off < bits_to_read ? 
LIMB_BITS - sp.lo_off : bits_to_read); - const uint32_t hi_bits = static_cast(bits_to_read) - sp.lo_bits; - // window_bits+1 ≤ 32 for our windows ⇒ lo_bits ≤ 32 ⇒ mask fits in uint32. - sp.lo_mask = (uint32_t{ 1 } << sp.lo_bits) - 1; - // If the natural hi-limb read would land past the end of the scalar's storage, - // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The - // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤ - // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow - // path's two unconditional limb loads branch-free. - if (static_cast(sp.lo_limb) + 1 >= num_uint64_limbs) { - sp.hi_limb = sp.lo_limb; - sp.hi_mask = 0; - } else { - sp.hi_limb = sp.lo_limb + 1; - sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1; - } - // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`. hi_bits == 0 - // captures both the in-limb case (window doesn't straddle a 64-bit boundary) and the - // clamped top-window case (above) where hi_mask was forced to 0. - sp.slice_localised_to_one_u64 = (hi_bits == 0); - } - return sp; -} - -/** - * @brief Read (window_bits+1) bits from `scalar_data` (uint64 limbs) using precomputed - * slice params and apply Constantine's signedWindowEncoding to produce a - * `(sign | bucket)` packed digit. - * - * Takes the slice params as scalar value parameters rather than a struct reference - * so the compiler can keep them in registers across the caller loop. - * - * `slice_localised_to_one_u64` selects the single-load fast path: ~75% of windows - * on typical 254-bit scalars (window_bits in [12, 19]) hit this. 
- */ -[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data, - uint32_t lo_limb, - uint32_t hi_limb, - uint32_t lo_off, - uint32_t lo_bits, - uint32_t lo_mask, - uint32_t hi_mask, - bool slice_localised_to_one_u64, - size_t window_bits) noexcept -{ - uint64_t raw_wide = 0; - if (slice_localised_to_one_u64) { - // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it. - raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask; - } else if (lo_mask == 0) { - // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is - // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the - // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0. - // sp_lo_mask is loop-invariant within a window but is a runtime stack value, so the - // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the - // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window. - raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits; - } else { - // Slow path: window straddles a uint64 boundary. - const uint64_t s_lo = scalar_data[lo_limb]; - const uint64_t s_hi = scalar_data[hi_limb]; - const uint64_t lo_part = (s_lo >> lo_off) & lo_mask; - const uint64_t hi_part = (s_hi & hi_mask) << lo_bits; - raw_wide = lo_part | hi_part; - } - // raw fits in window_bits+1 ≤ 32 bits, safe to narrow. - const uint32_t raw = static_cast(raw_wide); - - // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit - // `window_bits` is the sign indicator. - // - // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard - // branchless idiom. 
We use the equivalent `(encode - neg) ^ neg_mask` to break the - // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both - // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for - // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path - // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg} - // in parallel → ^neg_mask → &val_mask). Identical result by: - // neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓ - // neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓ - const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 }; - const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF - const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; - const uint32_t encode = (raw + 1) >> 1; - const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask; - - // Pack into (sign | bucket): sign in bit 31, bucket magnitude in the low bits. - return (neg << 31) | bucket_idx; -} - -// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed -// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86 -// (SSE2), ARM (NEON), and WASM (wasm-simd128). The branch on slice path is hoisted from -// the per-call site to the per-window outer loop, so callers select the localised / bottom / -// boundary specialisation once per window. -// -// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`: -// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …) -// values. `scalar.data` is a `std::array` whose byte layout is identical to -// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the -// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`, -// etc.). The reinterpret_cast is the same alias pattern. 
//
// Returns the four packed digits in `dst[0..3]`. The caller scatters them individually,
// since the consuming writes are not vectorisable. Switching from 2-wide uint64 to 4-wide
// uint32 doubles the compute throughput per SIMD instruction at the cost of slightly more
// straddle hits.
using SimdU32x4 = uint32_t __attribute__((vector_size(16)));

// Helpers work on `SimdU32x4` values directly so the v128 stays in the SIMD register file
// end-to-end. Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots.

// uint32-indexed Constantine slice params: the same idea as the u64-indexed params, but with
// limb indices measured in 32-bit chunks. Computed once per window by
// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below.
struct ConstantineSliceParamsU32 {
    uint32_t lo_mask; // mask applied to `limb[lo_limb] >> lo_off` (0 for the bottom window)
    uint32_t hi_mask; // mask applied to `limb[hi_limb]` before it is shifted up by `lo_bits`
    uint32_t lo_limb; // u32 limb index of the lookback bit
    uint32_t hi_limb; // == lo_limb + 1, clamped to lo_limb when that would leave the scalar
    uint32_t lo_off;  // bit-offset of the lookback bit within `lo_limb`
    uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount)
    bool slice_localised_to_one_u32; // all (window_bits + 1) bits live in `lo_limb`
    bool is_bottom_window;           // bit_offset == 0: the lookback bit is a synthetic zero
};

// Derive the per-window read plan for the (window_bits + 1)-bit slice that starts at the
// lookback bit `bit_offset - 1`.
//
//   bit_offset     absolute position of the first window bit (0 selects the bottom window)
//   window_bits    window width; the masks are built with 32-bit shifts, so it must be < 32
//   num_u32_limbs  number of 32-bit limbs in the scalar (used only for the top-limb clamp)
//
// No bounds check is made on `bit_offset` itself: the caller must keep the lookback bit
// inside the scalar.
[[nodiscard]] inline ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset,
                                                                                    size_t window_bits,
                                                                                    size_t num_u32_limbs) noexcept
{
    constexpr uint32_t LIMB_BITS_U32 = 32;
    ConstantineSliceParamsU32 sp{};

    if (bit_offset == 0) {
        // Bottom window: nothing is read through the `lo` side. The window bits come from
        // limb 0 via `hi_mask` and are shifted up by `lo_bits == 1`, which leaves a zero in
        // the lookback position.
        sp.lo_limb = 0;
        sp.hi_limb = 0;
        sp.lo_off = LIMB_BITS_U32 - 1;
        sp.lo_bits = 1;
        sp.lo_mask = 0;
        sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1;
        sp.slice_localised_to_one_u32 = false;
        sp.is_bottom_window = true;
        return sp;
    }

    const size_t lookback_bit = bit_offset - 1;
    const auto bits_to_read = static_cast<uint32_t>(window_bits + 1);
    sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS_U32);
    sp.lo_off = static_cast<uint32_t>(lookback_bit % LIMB_BITS_U32);

    // Bits available in `lo_limb` from the lookback bit upwards; whatever does not fit
    // there spills into the next limb.
    const uint32_t in_lo = LIMB_BITS_U32 - sp.lo_off;
    sp.lo_bits = (in_lo < bits_to_read) ? in_lo : bits_to_read;
    const uint32_t hi_bits = bits_to_read - sp.lo_bits;

    // A full-limb read cannot use `1 << 32` (shift count == type width), hence the special case.
    sp.lo_mask = (sp.lo_bits == LIMB_BITS_U32) ? ~uint32_t{ 0 } : ((uint32_t{ 1 } << sp.lo_bits) - 1);

    if (static_cast<size_t>(sp.lo_limb) + 1 >= num_u32_limbs) {
        // Top of the scalar: there is no next limb. Re-read `lo_limb` and mask it to zero so
        // the bits above the scalar contribute nothing.
        sp.hi_limb = sp.lo_limb;
        sp.hi_mask = 0;
    } else {
        sp.hi_limb = sp.lo_limb + 1;
        sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1;
    }
    sp.slice_localised_to_one_u32 = (hi_bits == 0);
    sp.is_bottom_window = false;
    return sp;
}

// Gather limb `idx` of four different scalars into one v128, lane i taken from `p<i>[idx]`.
// On WASM the lanes are filled with `v128.load32_lane` on top of a zero splat (1 splat +
// 4 load32_lane); the original author observed that brace-init `{a, b, c, d}` with runtime
// values emitted 4 scalar i32.load + 1 splat + 3 replace_lane instead. On native targets the
// brace-init form is used and lowering is left to the compiler.
[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32(
    const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept
{
#ifdef __wasm_simd128__
    v128_t v = wasm_i32x4_splat(0);
    v = wasm_v128_load32_lane(p0 + idx, v, 0);
    v = wasm_v128_load32_lane(p1 + idx, v, 1);
    v = wasm_v128_load32_lane(p2 + idx, v, 2);
    v = wasm_v128_load32_lane(p3 + idx, v, 3);
    return reinterpret_cast<SimdU32x4>(v);
#else
    return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] };
#endif
}

// Store a `SimdU32x4` to a 4-lane uint32 destination as a single 128-bit op.
// On WASM the explicit `wasm_v128_store` is used because earlier codegen for the equivalent
// struct-wrapper assignment was observed to round-trip the vector through 4 scalar memory
// slots. On native the store goes through `__builtin_memcpy`: `dst` is only guaranteed the
// alignment of `uint32_t`, so dereferencing it as a 16-byte-aligned vector pointer would be
// an alignment (and aliasing) violation. The fixed-size memcpy has no such requirement and
// still lowers to one unaligned vector store when optimised.
[[gnu::always_inline]] inline void simd_u32x4_store(uint32_t* dst, SimdU32x4 v) noexcept
{
#ifdef __wasm_simd128__
    wasm_v128_store(dst, reinterpret_cast<v128_t>(v));
#else
    __builtin_memcpy(dst, &v, sizeof(v));
#endif
}

// Shared tail of the three specialisations: turn four raw (window_bits + 1)-bit slices into
// four packed (sign | bucket) digits.
//   neg     = bit `window_bits` of raw (sign indicator)
//   encode  = (raw + 1) >> 1            (drops the lookback bit)
//   bucket  = ((encode - neg) ^ -neg) & val_mask   (branchless conditional negate)
//   packed  = (neg << 31) | bucket
// `-neg` is the vector-extension unary minus, giving 0 or 0xFFFFFFFF per lane.
[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 encode_constantine_raw_x4(SimdU32x4 raw,
                                                                                SimdU32x4 one_v,
                                                                                SimdU32x4 val_mask,
                                                                                uint32_t window_bits) noexcept
{
    const SimdU32x4 neg = (raw >> window_bits) & one_v;
    const SimdU32x4 neg_mask = -neg;
    const SimdU32x4 encode = (raw + one_v) >> 1;
    const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
    return (neg << 31) | bucket;
}

// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant
// within a window. Callers build them ONCE per window in the outer-w loop and pass them in,
// so the inner-i compute loop does not rebuild them per scalar.
//
// Each helper writes the four packed digits directly into the caller-provided 4-lane `dst`.

// Localised path: the whole slice lives in limb `lo_limb` of each scalar.
[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst,
                                                                                const uint32_t* scalar_data_0,
                                                                                const uint32_t* scalar_data_1,
                                                                                const uint32_t* scalar_data_2,
                                                                                const uint32_t* scalar_data_3,
                                                                                uint32_t lo_limb,
                                                                                uint32_t lo_off,
                                                                                SimdU32x4 lo_mask_v,
                                                                                SimdU32x4 one_v,
                                                                                SimdU32x4 val_mask,
                                                                                uint32_t window_bits) noexcept
{
    const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
    const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v;
    simd_u32x4_store(dst, encode_constantine_raw_x4(raw, one_v, val_mask, window_bits));
}

// Bottom path (bit_offset == 0): the window bits are read from `hi_limb` and shifted up by
// `lo_bits`, leaving zero in the lookback position.
[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst,
                                                                             const uint32_t* scalar_data_0,
                                                                             const uint32_t* scalar_data_1,
                                                                             const uint32_t* scalar_data_2,
                                                                             const uint32_t* scalar_data_3,
                                                                             uint32_t hi_limb,
                                                                             uint32_t lo_bits,
                                                                             SimdU32x4 hi_mask_v,
                                                                             SimdU32x4 one_v,
                                                                             SimdU32x4 val_mask,
                                                                             uint32_t window_bits) noexcept
{
    const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
    const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits;
    simd_u32x4_store(dst, encode_constantine_raw_x4(raw, one_v, val_mask, window_bits));
}

// Boundary path: the slice straddles two u32 limbs. The low `lo_bits` bits come from
// `lo_limb`, the remainder from `hi_limb` (masked to zero by the caller's `hi_mask_v` when
// the slice runs past the top of the scalar).
[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst,
                                                                               const uint32_t* scalar_data_0,
                                                                               const uint32_t* scalar_data_1,
                                                                               const uint32_t* scalar_data_2,
                                                                               const uint32_t* scalar_data_3,
                                                                               uint32_t lo_limb,
                                                                               uint32_t hi_limb,
                                                                               uint32_t lo_off,
                                                                               uint32_t lo_bits,
                                                                               SimdU32x4 lo_mask_v,
                                                                               SimdU32x4 hi_mask_v,
                                                                               SimdU32x4 one_v,
                                                                               SimdU32x4 val_mask,
                                                                               uint32_t window_bits) noexcept
{
    const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
    const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
    const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v;
    const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits;
    simd_u32x4_store(dst, encode_constantine_raw_x4(lo_part | hi_part, one_v, val_mask, window_bits));
}

// Path-selector enum used to dispatch on the SIMD specialisation once per window rather
// than once per scalar.
-enum class ConstantineSlicePath : uint8_t { - Localised = 0, - Bottom = 1, - Boundary = 2, -}; - -[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32( - const ConstantineSliceParamsU32& sp) noexcept -{ - if (sp.is_bottom_window) { - return ConstantineSlicePath::Bottom; - } - if (sp.slice_localised_to_one_u32) { - return ConstantineSlicePath::Localised; - } - return ConstantineSlicePath::Boundary; -} - -} // namespace bb::scalar_multiplication::round_parallel_detail diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp deleted file mode 100644 index 199166832bc7..000000000000 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp +++ /dev/null @@ -1,443 +0,0 @@ -// Unit tests for the Constantine signed-Booth window recoder used by the -// round-parallel Pippenger MSM. Validates the scalar packed-digit recoder, -// the SIMD x4 specialisations (Localised / Bottom / Boundary), and the -// round-trip identity `Σ_w (-1)^sign_w · bucket_w · 2^{B_w} ≡ scalar`. - -#include "pippenger_constantine.hpp" - -#include "barretenberg/ecc/curves/bn254/fr.hpp" -#include "barretenberg/numeric/random/engine.hpp" -#include "barretenberg/numeric/uint256/uint256.hpp" - -#include -#include -#include -#include - -namespace { - -namespace cnst = bb::scalar_multiplication::round_parallel_detail; -using ScalarField = bb::fr; -auto& engine = bb::numeric::get_randomness(); - -constexpr size_t LIMB_BITS_U64 = 64; -constexpr size_t NUM_LIMBS_U64 = 4; -constexpr size_t NUM_LIMBS_U32 = 8; -constexpr size_t MAX_BITS = 256; - -// ============================================================================= -// Reference signed-window encoder. 
Reads `(window_bits + 1)` bits from the -// scalar starting at `bit_offset - 1` (with a synthetic 0 at bit -1 when -// bit_offset == 0), then applies the signed-Booth encode: -// -// raw = bits [bit_offset-1, bit_offset + window_bits) -// neg = raw >> window_bits (top bit = sign indicator) -// encode = (raw + 1) >> 1 (drop the lookback bit) -// bucket = (encode - neg) ^ (-neg) (conditional negate, branchless) -// packed = (neg << 31) | bucket -// -// Same algebra as `get_constantine_packed_digit`, but implemented in the most -// obvious way against a flat `bit_at(i)` accessor so any error in the -// production path's limb-walking or branchless conditional negate will diverge. -// ============================================================================= -uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) -{ - auto bit_at = [&](int64_t i) -> uint64_t { - if (i < 0 || static_cast(i) >= MAX_BITS) { - return 0; - } - return (scalar_data[static_cast(i) / LIMB_BITS_U64] >> (static_cast(i) % LIMB_BITS_U64)) & - uint64_t{ 1 }; - }; - uint32_t raw = 0; - for (size_t k = 0; k <= window_bits; ++k) { - const int64_t bit_idx = static_cast(bit_offset) + static_cast(k) - 1; - raw |= static_cast(bit_at(bit_idx)) << k; - } - const uint32_t neg = (raw >> window_bits) & 1U; - const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; - const uint32_t encode = (raw + 1) >> 1; - const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask; - return (neg << 31) | bucket; -} - -// Random non-Montgomery scalar — uniform over [0, modulus). We invoke the -// recoder against the raw limbs so the random_element form is irrelevant; what -// matters is that the limb bytes are arbitrary. 
-std::array random_scalar_limbs() -{ - std::array out{}; - for (size_t i = 0; i < NUM_LIMBS_U64; ++i) { - out[i] = engine.get_random_uint64(); - } - return out; -} - -// View the same scalar as a uint32 limb array (little-endian: x86/ARM/WASM all -// agree). The SIMD x4 helpers index by uint32 limbs. -const uint32_t* as_u32(const std::array& s) -{ - return reinterpret_cast(s.data()); -} - -// Drive `get_constantine_packed_digit` via the params returned by -// `compute_constantine_slice_params`. The hot loop in Stage 1 / Stage 4 unpacks -// the struct into scalar values; we mirror that call shape exactly so a future -// API change here would be caught. -uint32_t production_scalar_path(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits) -{ - const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64); - return cnst::get_constantine_packed_digit(scalar_data, - sp.lo_limb, - sp.hi_limb, - sp.lo_off, - sp.lo_bits, - sp.lo_mask, - sp.hi_mask, - sp.slice_localised_to_one_u64, - window_bits); -} - -// Drive the 4-wide SIMD specialisations by classifying the slice path and -// calling the matching `store_constantine_packed_digits_x4_*` helper. Out[i] -// is the packed digit for the i-th scalar. Mirrors Stage 1's per-window -// dispatch loop in `scalar_multiplication.cpp`. 
-void production_simd_path(const std::array scalars[4], - size_t bit_offset, - size_t window_bits, - uint32_t out[4]) -{ - const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32); - const cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask }; - const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask }; - const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 }; - const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1; - const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar }; - - const uint32_t* s0 = as_u32(scalars[0]); - const uint32_t* s1 = as_u32(scalars[1]); - const uint32_t* s2 = as_u32(scalars[2]); - const uint32_t* s3 = as_u32(scalars[3]); - - const uint32_t wb_u32 = static_cast(window_bits); - switch (cnst::classify_slice_path_u32(sp)) { - case cnst::ConstantineSlicePath::Localised: - cnst::store_constantine_packed_digits_x4_localised( - out, s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32); - break; - case cnst::ConstantineSlicePath::Bottom: - cnst::store_constantine_packed_digits_x4_bottom( - out, s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32); - break; - case cnst::ConstantineSlicePath::Boundary: - cnst::store_constantine_packed_digits_x4_boundary(out, - s0, - s1, - s2, - s3, - sp.lo_limb, - sp.hi_limb, - sp.lo_off, - sp.lo_bits, - lo_mask_v, - hi_mask_v, - one_v, - val_mask, - wb_u32); - break; - } -} - -} // namespace - -// ============================================================================= -// Test 1 — Scalar packed-digit recoder matches the textbook reference oracle -// across all `(window_bits, bit_offset)` pairs the live pipeline ever issues. 
-// ============================================================================= -TEST(PippengerConstantine, ScalarMatchesReferenceOracleAllWindowBits) -{ - constexpr size_t TRIALS_PER_SHAPE = 32; - // window_bits range covers production: choose_window_bits returns 2..19, - // build_var_window_schedule's final window can additionally be 1 bit wide - // (e.g. wb=3 over 256 bits yields 85*3 + 1). bit_offset 255 covers the - // above-modulus top edge where every read bit is structurally zero. - for (size_t window_bits = 1; window_bits <= 19; ++window_bits) { - for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) { - for (size_t t = 0; t < TRIALS_PER_SHAPE; ++t) { - const auto s = random_scalar_limbs(); - const uint32_t got = production_scalar_path(s.data(), bit_offset, window_bits); - const uint32_t want = reference_packed_digit(s.data(), bit_offset, window_bits); - ASSERT_EQ(got, want) << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " trial=" << t; - } - } - } -} - -// ============================================================================= -// Test 2 — SIMD x4 path agrees with the scalar path lane-by-lane across all -// three specialisations (Localised / Bottom / Boundary). Each bit_offset -// implicitly selects which specialisation runs; we sweep every offset so all -// three are exercised. -// ============================================================================= -TEST(PippengerConstantine, SimdX4MatchesScalarPathLanewise) -{ - constexpr size_t TRIALS_PER_SHAPE = 16; - bool saw_localised = false; - bool saw_bottom = false; - bool saw_boundary = false; - // window_bits range covers production: choose_window_bits returns 2..19, - // build_var_window_schedule's final window can additionally be 1 bit wide - // (e.g. wb=3 over 256 bits yields 85*3 + 1). bit_offset 255 covers the - // above-modulus top edge where every read bit is structurally zero. 
- for (size_t window_bits = 1; window_bits <= 19; ++window_bits) { - for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) { - const auto sp_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32); - switch (cnst::classify_slice_path_u32(sp_u32)) { - case cnst::ConstantineSlicePath::Localised: - saw_localised = true; - break; - case cnst::ConstantineSlicePath::Bottom: - saw_bottom = true; - break; - case cnst::ConstantineSlicePath::Boundary: - saw_boundary = true; - break; - } - for (size_t t = 0; t < TRIALS_PER_SHAPE; ++t) { - std::array, 4> scalars{ - random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs() - }; - std::array got_simd{}; - production_simd_path(scalars.data(), bit_offset, window_bits, got_simd.data()); - for (size_t lane = 0; lane < 4; ++lane) { - const uint32_t want = production_scalar_path(scalars[lane].data(), bit_offset, window_bits); - ASSERT_EQ(got_simd[lane], want) - << "window_bits=" << window_bits << " bit_offset=" << bit_offset << " lane=" << lane; - } - } - } - } - // The sweep must exercise all three specialisations or the SIMD coverage is - // a no-op for a path. (Coverage check, not a behavioural claim.) - EXPECT_TRUE(saw_localised); - EXPECT_TRUE(saw_bottom); - EXPECT_TRUE(saw_boundary); -} - -// ============================================================================= -// Test 3 — Round-trip identity. For any tiled window schedule covering -// [0, total_bits) bits, the sum `Σ_w (-1)^sign_w · bucket_w · 2^{B_w}` must -// equal the scalar value modulo 2^total_bits. This is the load-bearing -// algebraic invariant the whole MSM rests on; if it ever fails the rest of -// the pipeline silently mis-computes the result. 
-// ============================================================================= -TEST(PippengerConstantine, RoundTripIdentityMatchesScalarMod2N) -{ - constexpr size_t TOTAL_BITS = 254; - constexpr size_t TRIALS = 64; - // Including window_bits == 1 because `build_var_window_schedule` truncates - // the final window to whatever bits remain, which can be exactly 1. - for (size_t window_bits = 1; window_bits <= 19; ++window_bits) { - for (size_t t = 0; t < TRIALS; ++t) { - const auto s = random_scalar_limbs(); - // Recover scalar value as a 256-bit big integer (4 × uint64). - // We reconstruct it limb-by-limb using __int128 arithmetic so the - // round-trip is plainly readable; production code uses field - // arithmetic, which we deliberately avoid here. - // - // Tile windows of width `window_bits` until we cover TOTAL_BITS+2 - // bits. The +2 mirrors the `total_bits = num_bits + 2` budget used - // by `build_var_window_schedule` to absorb the carry-less top bit. - std::vector> signed_digits; // (signed_value, bit_offset) - size_t bit_offset = 0; - size_t bits_remaining = TOTAL_BITS + 2; - while (bits_remaining > 0) { - const size_t wb = std::min(window_bits, bits_remaining); - const uint32_t packed = production_scalar_path(s.data(), bit_offset, wb); - const uint32_t neg = packed >> 31; - const uint32_t bucket = packed & ((uint32_t{ 1 } << wb) - 1); - const int32_t signed_val = (neg != 0U) ? -static_cast(bucket) : static_cast(bucket); - signed_digits.emplace_back(signed_val, bit_offset); - bit_offset += wb; - bits_remaining -= wb; - } - - // Reconstruct: Σ_w signed_val_w · 2^{bit_offset_w} mod 2^256, using - // uint256_t arithmetic where signed subtraction is just `acc -= |v| << off`. - bb::numeric::uint256_t acc(0); - for (const auto& [v, off] : signed_digits) { - const bb::numeric::uint256_t shifted = bb::numeric::uint256_t(static_cast(v < 0 ? 
-v : v)) - << bb::numeric::uint256_t(off); - if (v < 0) { - acc -= shifted; - } else { - acc += shifted; - } - } - const bb::numeric::uint256_t scalar_val(s[0], s[1], s[2], s[3]); - EXPECT_EQ(acc, scalar_val) << "window_bits=" << window_bits << " trial=" << t; - } - } -} - -// ============================================================================= -// Test 4 — Edge cases. Pin the structural boundaries explicitly so a regression -// at one of them (rather than at a random bit) shows up as a named failure. -// ============================================================================= -TEST(PippengerConstantine, EdgeCases) -{ - // (a) Zero scalar — every packed digit must be 0 (sign 0, bucket 0). - // Sweep includes wb=1 (final-window truncation) and bit_offset=255 - // (above-modulus top edge — every bit read is structurally zero). - std::array zero{}; - for (size_t wb = 1; wb <= 19; ++wb) { - for (size_t off = 0; off <= 255; ++off) { - EXPECT_EQ(production_scalar_path(zero.data(), off, wb), uint32_t{ 0 }) - << "zero scalar wb=" << wb << " off=" << off; - } - } - - // (b) Bottom window — bit_offset == 0 must select the synthetic-zero - // lookback path. The classifier flags it via `is_bottom_window`. - const auto sp_bottom = cnst::compute_constantine_slice_params_u32(0, 12, NUM_LIMBS_U32); - EXPECT_TRUE(sp_bottom.is_bottom_window); - EXPECT_EQ(cnst::classify_slice_path_u32(sp_bottom), cnst::ConstantineSlicePath::Bottom); - - // (c) Top window — when the natural hi_limb read lands past the scalar's - // storage, the production code clamps `hi_limb` and zeros `hi_mask`. The - // packed digit must still match the reference oracle (which extends with - // zeros above bit 256). Sweep all the way to bit_offset=255 to cover the - // above-modulus case where every read bit is structurally zero. 
- auto top_aligned = random_scalar_limbs(); - constexpr size_t window_bits = 12; - for (size_t bit_offset = 240; bit_offset <= 255; ++bit_offset) { - const uint32_t got = production_scalar_path(top_aligned.data(), bit_offset, window_bits); - const uint32_t want = reference_packed_digit(top_aligned.data(), bit_offset, window_bits); - EXPECT_EQ(got, want) << "top window bit_offset=" << bit_offset; - } - - // (d) Localised fast path — the c+1-bit window must fit inside a single - // uint64 limb for the localised path to be selected. With window_bits=12 - // and bit_offset=10, the lookback bit is at limb 0, bit 9; the window - // spans bits 10..21 — all inside limb 0, so localised path fires. - const auto sp_local = cnst::compute_constantine_slice_params(10, 12, NUM_LIMBS_U64); - EXPECT_TRUE(sp_local.slice_localised_to_one_u64); - - // (e) Boundary case — when the window straddles a uint64 boundary the - // localised flag must be false. With window_bits=12 and bit_offset=60, - // the window spans bits 59..71 → crosses bit 63→64. - const auto sp_boundary = cnst::compute_constantine_slice_params(60, 12, NUM_LIMBS_U64); - EXPECT_FALSE(sp_boundary.slice_localised_to_one_u64); -} - -// ============================================================================= -// Test 5 — Named slice-shape table. Random sweeps probably hit every -// (limb_index, slice_path) combination, but a regression at one of these -// boundaries (e.g. "boundary across bit 31→32, lookback in lo half") shows up -// as a named failure here rather than an opaque "trial 17 of 32" log line. -// -// `bit_offset` here is the absolute bit position of the FIRST window bit; the -// lookback bit lives at `bit_offset - 1`. Each row pins the (bit_offset, wb) -// pair, the expected slice path under u32 indexing, and the expected -// localisation under u64 indexing. 
-// ============================================================================= -TEST(PippengerConstantine, NamedSliceShapes) -{ - struct ShapeCase { - const char* name; - size_t bit_offset; - size_t window_bits; - cnst::ConstantineSlicePath u32_path; - bool u64_localised; // expected `slice_localised_to_one_u64` - }; - // Picked so each row exercises a structurally distinct shape: - // - bottom_* : synthetic-lookback path - // - local_* : c+1 bits fit inside a single u64 limb (and matching u32) - // - boundary_* : window straddles a u64 or u32 limb boundary - // - top_clamped : hi_limb would land past scalar storage → clamp + zero mask - const std::array cases{ { - // Bottom — bit_offset 0 across several wb. - { "bottom_wb12", 0, 12, cnst::ConstantineSlicePath::Bottom, false }, - { "bottom_wb2", 0, 2, cnst::ConstantineSlicePath::Bottom, false }, - { "bottom_wb19", 0, 19, cnst::ConstantineSlicePath::Bottom, false }, - // Localised — lookback + window inside a single u32 (and therefore a single u64). - { "local_lo_u32", 10, 12, cnst::ConstantineSlicePath::Localised, true }, - // Localised in u64 but boundary in u32 — lookback at bit 30 (u32 limb 0), window spans bits 30..42 - // (crosses u32 bit 31→32) but stays inside u64 limb 0. - { "local_u64_boundary_u32", 31, 12, cnst::ConstantineSlicePath::Boundary, true }, - // Boundary across u64 bit 63→64. - { "boundary_u64_at_63", 60, 12, cnst::ConstantineSlicePath::Boundary, false }, - { "boundary_u64_at_127", 124, 12, cnst::ConstantineSlicePath::Boundary, false }, - { "boundary_u64_at_191", 188, 12, cnst::ConstantineSlicePath::Boundary, false }, - // Boundary at u32 bit 31→32 with lookback in low half. - { "boundary_u32_at_31", 30, 4, cnst::ConstantineSlicePath::Boundary, true }, - // Top window — clamp regime. With wb=12, bit_offset=246 reads bits 245..257; hi limb is past - // the scalar's 256-bit storage in u32 view (limb_index 7 is the last). 
- { "top_clamped_wb12", 246, 12, cnst::ConstantineSlicePath::Boundary, false }, - // wb=1 at the very top — the final-window case `build_var_window_schedule` can emit. - { "top_wb1_final", 254, 1, cnst::ConstantineSlicePath::Localised, true }, - // Random mid-scalar localised case as a "happy path" anchor. - { "local_mid_u64", 80, 12, cnst::ConstantineSlicePath::Localised, true }, - } }; - - auto s = random_scalar_limbs(); - for (const auto& c : cases) { - const auto sp_u32 = cnst::compute_constantine_slice_params_u32(c.bit_offset, c.window_bits, NUM_LIMBS_U32); - const auto sp_u64 = cnst::compute_constantine_slice_params(c.bit_offset, c.window_bits, NUM_LIMBS_U64); - EXPECT_EQ(cnst::classify_slice_path_u32(sp_u32), c.u32_path) << "case=" << c.name; - EXPECT_EQ(sp_u64.slice_localised_to_one_u64, c.u64_localised) << "case=" << c.name; - - // The encoder must still produce the reference value at each named shape. - const uint32_t got = production_scalar_path(s.data(), c.bit_offset, c.window_bits); - const uint32_t want = reference_packed_digit(s.data(), c.bit_offset, c.window_bits); - EXPECT_EQ(got, want) << "case=" << c.name; - } -} - -// ============================================================================= -// Test 6 — u64 / u32 param classifier internal consistency. -// -// The scalar path uses `ConstantineSliceParams` (u64-indexed); the SIMD path -// uses `ConstantineSliceParamsU32` (u32-indexed). Comparing final packed -// digits (Test 1+2) catches END-to-END divergence, but a compensating bug -// across the two param computations could mask itself. This test asserts the -// param structs encode the SAME lookback bit position and the SAME read width -// where their definitions agree, so a bug in one classifier alone shows up -// even if the digits happen to round-trip. 
-// ============================================================================= -TEST(PippengerConstantine, ParamClassifierU64U32Consistency) -{ - for (size_t wb = 1; wb <= 19; ++wb) { - for (size_t bit_offset = 0; bit_offset <= 255; ++bit_offset) { - const auto sp_u64 = cnst::compute_constantine_slice_params(bit_offset, wb, NUM_LIMBS_U64); - const auto sp_u32 = cnst::compute_constantine_slice_params_u32(bit_offset, wb, NUM_LIMBS_U32); - - // Bottom-window classification: both must agree (u64 signals via lo_mask==0, - // u32 via the explicit is_bottom_window flag). - const bool u64_says_bottom = (sp_u64.lo_mask == 0); - EXPECT_EQ(u64_says_bottom, sp_u32.is_bottom_window) - << "bottom classification disagrees at bit_offset=" << bit_offset << " wb=" << wb; - - // Lookback bit absolute position: lo_limb·LIMB_BITS + lo_off. Both views must - // identify the same absolute bit (skip bottom, where the lookback is synthetic - // and the limb/offset encoding is intentionally not a real position). - if (!sp_u32.is_bottom_window) { - const size_t u64_lookback = sp_u64.lo_limb * 64 + sp_u64.lo_off; - const size_t u32_lookback = sp_u32.lo_limb * 32 + sp_u32.lo_off; - EXPECT_EQ(u64_lookback, u32_lookback) - << "lookback bit disagrees at bit_offset=" << bit_offset << " wb=" << wb; - EXPECT_EQ(u64_lookback, bit_offset - 1) - << "lookback bit ≠ bit_offset-1 at bit_offset=" << bit_offset << " wb=" << wb; - } - - // Localised-flag implication: u64-localised means the whole c+1 window lives in - // one u64 limb. That does NOT imply u32-localised (window could still straddle - // a u32 boundary inside the same u64), but it DOES imply the u32 view's slice - // path is NOT Bottom (bit_offset > 0 cases only). - if (sp_u64.slice_localised_to_one_u64 && bit_offset > 0) { - EXPECT_NE(cnst::classify_slice_path_u32(sp_u32), cnst::ConstantineSlicePath::Bottom) - << "u64-localised but u32 classifier says Bottom at bit_offset=" << bit_offset << " wb=" << wb; - } - } - } -}