Skip to content

Commit 64f5310

Browse files
authored
feat: merge-train/barretenberg (#23766)
BEGIN_COMMIT_OVERRIDE feat: extract Constantine signed-Booth window recoder with tests + fuzzer (#23562) END_COMMIT_OVERRIDE
2 parents ff8944f + 5d1315b commit 64f5310

4 files changed

Lines changed: 1035 additions & 0 deletions

File tree

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Shared carry-less signed-Booth window slice parameters.
2+
//
3+
// Each window is a c-bit signed digit in [-2^(c-1), 2^(c-1)], read as a (c+1)-bit
4+
// slice that overlaps its lower neighbour by one bit; the shared boundary bit
5+
// substitutes for an explicit carry. This is the algorithm Constantine calls
6+
// `signedWindowEncoding` / `getSignedFullWindowAt`
7+
// (constantine/math/arithmetic/bigints.nim).
8+
//
9+
// The struct + `compute_booth_slice_params` live here so they can be shared
10+
// between:
11+
// * `ecc/groups/element_impl.hpp` — the GLV-endo straus path uses a small
12+
// fixed-window (c=4, 32 windows) Booth recoding;
13+
// * `ecc/scalar_multiplication/pippenger_constantine.hpp` — the round-parallel
14+
// Pippenger MSM uses the same recoding at runtime-chosen window sizes.
15+
// The two callers diverge on the packed-digit reader (perf-tuned multi-path +
16+
// SIMD x4 in MSM; simple branchless in element_impl) — only the slice-param
17+
// computation is shared.
18+
19+
#pragma once
20+
21+
#include <cstddef>
22+
#include <cstdint>
23+
24+
namespace bb::ecc::booth {

/**
 * @brief Per-window precomputed slice parameters for the carry-less signed-Booth
 * window recoding. Read out by the per-(point, window) hot loop as two limb
 * loads + a fixed bit-twiddle (no branches, no per-iter address arithmetic).
 *
 * The (window_bits + 1)-bit slice is described as a "lo" part taken from
 * `lo_limb` and a "hi" part taken from `hi_limb`.
 *
 * `slice_localised_to_one_u64`: true iff every bit of the (c+1)-bit window lives
 * inside a single uint64 limb. Callers that have a single-load fast path branch on
 * this; callers that don't can ignore it (the field is one bool — zero cost).
 */
struct BoothSliceParams {
    uint32_t lo_mask; // `lo_bits` low ones; 0 for the bottom window (synthetic boundary bit)
    uint32_t hi_mask; // ones for the bits taken from `hi_limb`; 0 when `hi_limb` was clamped
    uint32_t lo_limb; // index of the uint64 limb holding the lookback (boundary) bit
    uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window
    uint32_t lo_off;  // bit position of the lookback bit inside `lo_limb`
    uint32_t lo_bits; // number of slice bits that come from `lo_limb`
    bool slice_localised_to_one_u64;
};

namespace detail {

/**
 * @brief Mask with the low `num_bits` bits set, saturating at 32 bits.
 *
 * `uint32_t{ 1 } << 32` is undefined behaviour (and ill-formed in a constant
 * expression), so a full-width request is answered with all-ones explicitly.
 *
 * @param num_bits Number of low bits to set; values >= 32 yield 0xFFFFFFFF.
 * @return The mask.
 */
[[nodiscard]] constexpr uint32_t low_bits_mask(size_t num_bits) noexcept
{
    constexpr size_t MASK_WIDTH = 32;
    return num_bits >= MASK_WIDTH ? ~uint32_t{ 0 } : (uint32_t{ 1 } << num_bits) - 1;
}

} // namespace detail

/**
 * @brief Compute the Booth slice params for a window starting at absolute bit
 * position `bit_offset`. The slice is `[bit_offset - 1, bit_offset + window_bits)`;
 * the bit at `bit_offset - 1` is the shared boundary bit. The bottom window
 * (`bit_offset == 0`) is encoded specially so the same recoding algebra
 * applies — read "limb -1" as a zero-masked load.
 *
 * `constexpr` so callers with compile-time window schedules
 * (`element_impl`'s GLV-endo 32-window table) can materialise the param
 * array at compile time, while runtime-schedule callers (Pippenger) use
 * the same function at runtime.
 *
 * @param bit_offset       Absolute bit index of the window's least-significant bit.
 * @param window_bits      Window width c; the masks are 32-bit, so c + 1 <= 32 is expected.
 * @param num_uint64_limbs Number of uint64 limbs backing the scalar; used only to
 *                         clamp `hi_limb`. `lo_limb` itself is not clamped, so the
 *                         caller must keep `bit_offset - 1` inside the scalar's storage.
 * @return The slice parameters for this window.
 */
[[nodiscard]] constexpr BoothSliceParams compute_booth_slice_params(size_t bit_offset,
                                                                    size_t window_bits,
                                                                    size_t num_uint64_limbs) noexcept
{
    constexpr size_t LIMB_BITS = 64;
    BoothSliceParams sp{};

    if (bit_offset == 0) {
        // Bottom window: the boundary bit below the LSB is a synthetic 0. Encode this by
        // reading "limb -1" as a zero-masked load (lo_mask = 0), then reading window_bits
        // bits from limb 0 into the hi side and shifting them left by 1. This puts the
        // window_bits-bit window at bits 1..window_bits with bit 0 = 0, matching the inner-
        // loop body used by every other window. Not localised — the synthetic-lookback
        // assembly only works in the slow path.
        sp.lo_limb = 0; // safe in-range, but masked to 0
        sp.hi_limb = 0; // = scalar limb 0
        sp.lo_off = LIMB_BITS - 1;
        sp.lo_bits = 1; // shifts hi_part left by 1, planting the window at bits 1..window_bits
        sp.lo_mask = 0; // lo_part contributes nothing
        sp.hi_mask = detail::low_bits_mask(window_bits);
        sp.slice_localised_to_one_u64 = false;
        return sp;
    }

    const size_t lookback_bit = bit_offset - 1;
    const size_t bits_to_read = window_bits + 1;
    sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS);
    sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS - 1));

    // Take as many slice bits as `lo_limb` still has above `lo_off`, capped at the slice width.
    const size_t bits_left_in_limb = LIMB_BITS - sp.lo_off;
    sp.lo_bits = static_cast<uint32_t>(bits_left_in_limb < bits_to_read ? bits_left_in_limb : bits_to_read);
    const uint32_t hi_bits = static_cast<uint32_t>(bits_to_read) - sp.lo_bits;

    // window_bits + 1 <= 32 for our windows, so lo_bits <= 32. lo_bits == 32 is reachable
    // (window_bits == 31 with lo_off <= 32), hence the saturating helper rather than a raw shift.
    sp.lo_mask = detail::low_bits_mask(sp.lo_bits);

    // If the natural hi-limb read would land past the end of the scalar's storage,
    // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The
    // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤
    // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow
    // path's two unconditional limb loads branch-free.
    const bool hi_limb_out_of_range = static_cast<size_t>(sp.lo_limb) + 1 >= num_uint64_limbs;
    if (hi_limb_out_of_range) {
        sp.hi_limb = sp.lo_limb;
        sp.hi_mask = 0;
    } else {
        sp.hi_limb = sp.lo_limb + 1;
        sp.hi_mask = detail::low_bits_mask(hi_bits);
    }

    // Fast path: the full (window_bits+1)-bit slice lives inside `lo_limb`, i.e. it does
    // not straddle a 64-bit boundary. Note that `hi_bits` is NOT reset by the clamp above:
    // a clamped top window that would have straddled keeps hi_bits > 0 and is therefore
    // reported as not localised; it takes the two-load path, where hi_mask == 0 discards
    // the re-read limb.
    sp.slice_localised_to_one_u64 = (hi_bits == 0);
    return sp;
}

} // namespace bb::ecc::booth
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// libFuzzer target for the Constantine signed-Booth window recoder.
2+
//
3+
// Two-pronged differential check on each input:
4+
// 1. Scalar path vs textbook reference oracle — catches encoder algebra bugs.
5+
// 2. SIMD x4 path vs scalar path (lane-by-lane) — catches lane-mux / mask /
6+
// vector-shift bugs in the three slice-path specialisations.
7+
//
8+
// Input layout: 1 byte window_bits ∈ [1, 19], 1 byte bit_offset ∈ [0, 255],
9+
// followed by 32 bytes × 4 = 128 bytes of scalar limb material. Total minimum
10+
// input = 130 bytes; smaller inputs are zero-padded so libFuzzer's empty-seed
11+
// kickoff still drives the encoder.
12+
//
13+
// Run:
14+
// cmake --preset fuzzing && cmake --build --preset fuzzing --target ecc_pippenger_constantine_fuzzer
15+
// ./build-fuzzing/bin/ecc_pippenger_constantine_fuzzer -max_total_time=60
16+
17+
#include "pippenger_constantine.hpp"
18+
19+
#include "barretenberg/numeric/uint256/uint256.hpp"
20+
21+
#include <array>
22+
#include <cstdint>
23+
#include <cstring>
24+
25+
namespace {
26+
27+
namespace cnst = bb::scalar_multiplication::round_parallel_detail;
28+
29+
constexpr size_t LIMB_BITS_U64 = 64;
30+
constexpr size_t NUM_LIMBS_U64 = 4;
31+
constexpr size_t NUM_LIMBS_U32 = 8;
32+
constexpr size_t MAX_BITS = 256;
33+
constexpr size_t SCALAR_BYTES = 32;
34+
35+
uint32_t reference_packed_digit(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
36+
{
37+
auto bit_at = [&](int64_t i) -> uint64_t {
38+
if (i < 0 || static_cast<size_t>(i) >= MAX_BITS) {
39+
return 0;
40+
}
41+
return (scalar_data[static_cast<size_t>(i) / LIMB_BITS_U64] >> (static_cast<size_t>(i) % LIMB_BITS_U64)) &
42+
uint64_t{ 1 };
43+
};
44+
uint32_t raw = 0;
45+
for (size_t k = 0; k <= window_bits; ++k) {
46+
const int64_t bit_idx = static_cast<int64_t>(bit_offset) + static_cast<int64_t>(k) - 1;
47+
raw |= static_cast<uint32_t>(bit_at(bit_idx)) << k;
48+
}
49+
const uint32_t neg = (raw >> window_bits) & 1U;
50+
const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1;
51+
const uint32_t encode = (raw + 1) >> 1;
52+
const uint32_t bucket = ((encode - neg) ^ (uint32_t{ 0 } - neg)) & val_mask;
53+
return (neg << 31) | bucket;
54+
}
55+
56+
uint32_t production_scalar(const uint64_t* scalar_data, size_t bit_offset, size_t window_bits)
57+
{
58+
const auto sp = cnst::compute_constantine_slice_params(bit_offset, window_bits, NUM_LIMBS_U64);
59+
return cnst::get_constantine_packed_digit(scalar_data,
60+
sp.lo_limb,
61+
sp.hi_limb,
62+
sp.lo_off,
63+
sp.lo_bits,
64+
sp.lo_mask,
65+
sp.hi_mask,
66+
sp.slice_localised_to_one_u64,
67+
window_bits);
68+
}
69+
70+
void production_simd(const std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4>& scalars,
71+
size_t bit_offset,
72+
size_t window_bits,
73+
std::array<uint32_t, 4>& out)
74+
{
75+
const auto sp = cnst::compute_constantine_slice_params_u32(bit_offset, window_bits, NUM_LIMBS_U32);
76+
const cnst::SimdU32x4 lo_mask_v{ sp.lo_mask, sp.lo_mask, sp.lo_mask, sp.lo_mask };
77+
const cnst::SimdU32x4 hi_mask_v{ sp.hi_mask, sp.hi_mask, sp.hi_mask, sp.hi_mask };
78+
const cnst::SimdU32x4 one_v{ 1, 1, 1, 1 };
79+
const uint32_t val_mask_scalar = (uint32_t{ 1 } << window_bits) - 1;
80+
const cnst::SimdU32x4 val_mask{ val_mask_scalar, val_mask_scalar, val_mask_scalar, val_mask_scalar };
81+
const auto* s0 = reinterpret_cast<const uint32_t*>(scalars[0].data());
82+
const auto* s1 = reinterpret_cast<const uint32_t*>(scalars[1].data());
83+
const auto* s2 = reinterpret_cast<const uint32_t*>(scalars[2].data());
84+
const auto* s3 = reinterpret_cast<const uint32_t*>(scalars[3].data());
85+
const auto wb_u32 = static_cast<uint32_t>(window_bits);
86+
87+
switch (cnst::classify_slice_path_u32(sp)) {
88+
case cnst::ConstantineSlicePath::Localised:
89+
cnst::store_constantine_packed_digits_x4_localised(
90+
out.data(), s0, s1, s2, s3, sp.lo_limb, sp.lo_off, lo_mask_v, one_v, val_mask, wb_u32);
91+
break;
92+
case cnst::ConstantineSlicePath::Bottom:
93+
cnst::store_constantine_packed_digits_x4_bottom(
94+
out.data(), s0, s1, s2, s3, sp.hi_limb, sp.lo_bits, hi_mask_v, one_v, val_mask, wb_u32);
95+
break;
96+
case cnst::ConstantineSlicePath::Boundary:
97+
cnst::store_constantine_packed_digits_x4_boundary(out.data(),
98+
s0,
99+
s1,
100+
s2,
101+
s3,
102+
sp.lo_limb,
103+
sp.hi_limb,
104+
sp.lo_off,
105+
sp.lo_bits,
106+
lo_mask_v,
107+
hi_mask_v,
108+
one_v,
109+
val_mask,
110+
wb_u32);
111+
break;
112+
}
113+
}
114+
115+
} // namespace
116+
117+
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
118+
{
119+
// Pad input to the minimum required length so empty / tiny seeds still
120+
// exercise the encoder against zero-extended scalars.
121+
constexpr size_t MIN_INPUT = 2 + (SCALAR_BYTES * 4);
122+
std::array<uint8_t, MIN_INPUT> buf{};
123+
std::memcpy(buf.data(), data, std::min(size, MIN_INPUT));
124+
125+
// window_bits ∈ [1, 19] — `choose_window_bits` returns [2,19]; the final
126+
// window emitted by `build_var_window_schedule` can additionally be 1 bit
127+
// (e.g. wb=3 over 256 bits = 85*3+1). Outside this range the encoder has
128+
// no well-defined behavior in production.
129+
const size_t window_bits = 1 + (buf[0] % 19);
130+
// bit_offset ∈ [0, 255] — the live pipeline's range, including the top
131+
// edge where bit_offset+wb extends past the scalar's 256 bits (production
132+
// code clamps `hi_limb` and zeros `hi_mask`).
133+
const size_t bit_offset = buf[1] & 0xff;
134+
135+
std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> scalars{};
136+
for (size_t lane = 0; lane < 4; ++lane) {
137+
std::memcpy(scalars[lane].data(), buf.data() + 2 + (lane * SCALAR_BYTES), SCALAR_BYTES);
138+
}
139+
140+
// Check 1: scalar path matches the textbook reference oracle.
141+
for (size_t lane = 0; lane < 4; ++lane) {
142+
const uint32_t got = production_scalar(scalars[lane].data(), bit_offset, window_bits);
143+
const uint32_t want = reference_packed_digit(scalars[lane].data(), bit_offset, window_bits);
144+
if (got != want) {
145+
__builtin_trap();
146+
}
147+
}
148+
149+
// Check 2: SIMD x4 path agrees with scalar path lane-by-lane.
150+
std::array<uint32_t, 4> simd_out{};
151+
production_simd(scalars, bit_offset, window_bits, simd_out);
152+
for (size_t lane = 0; lane < 4; ++lane) {
153+
const uint32_t want = production_scalar(scalars[lane].data(), bit_offset, window_bits);
154+
if (simd_out[lane] != want) {
155+
__builtin_trap();
156+
}
157+
}
158+
159+
return 0;
160+
}

0 commit comments

Comments
 (0)