Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions bench/experiments.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,19 @@
- PGO is a build-time optimization: it changes how LLVM compiles the code, not what code runs
- Could be integrated into CI for release builds or the prover docker image

## E8. Remove par_bridge from hot paths

**Status:** ✅ COMPLETE — PR #2729
**Hypothesis:** `par_bridge()` uses a mutex internally for work distribution, which shows up as `Mutex::lock_contended` (1.3% of the profile). Replacing it with an indexed parallel iterator (`par_chunks_mut` / `into_par_iter`) removes the mutex.
**Expected delta:** 1-3%
**Actual delta:** -6.8% fib, -7.6% keccak, -7.1% big (on top of E1+E2)

### Changed files
- `slop/crates/multilinear/src/restrict.rs` — `mle_fix_last_variable`: `.chunks().zip().par_bridge()` → `.par_chunks_mut().enumerate()`
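- `slop/crates/jagged/src/poly.rs` — `eval`: `.iter().zip().zip().enumerate().par_bridge()` → `(0..N).into_par_iter()` with manual indexing; the uninitialized `branching_program_evals` scratch `Vec` (and its `unsafe { set_len }`) is removed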
- `slop/crates/jagged/src/poly.rs` — `partial_jagged_little_polynomial_evaluation`: `.chunks_mut().enumerate().par_bridge()` → `.par_chunks_mut().enumerate()`
- `crates/hypercube/src/prover/zerocheck/sum_as_poly.rs` — `.chunks().zip().enumerate().par_bridge()` → `(0..N).into_par_iter()` with manual indexing

## ~~E7. SIMD feature gates~~

**Status:** ELIMINATED — AVX-512 confirmed active in Phase 1 profile
19 changes: 12 additions & 7 deletions crates/hypercube/src/prover/zerocheck/sum_as_poly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::{
};

use itertools::Itertools;
use rayon::iter::{ParallelBridge, ParallelIterator};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde::{Deserialize, Serialize};
use slop_air::Air;
use slop_algebra::{
Expand Down Expand Up @@ -85,12 +85,17 @@ where
// Handle the case when the zerocheck polynomial has non-padded variables.
let eq_guts = eq_guts[0..num_non_padded_terms].to_vec();

let cumul_ys = eq_guts
.chunks(eq_chunk_size)
.zip(main_values.chunks(values_chunk_size * num_main_columns))
.enumerate()
.par_bridge()
.map(|(i, (eq_chunk, main_chunk))| {
let num_chunks = eq_guts.len().div_ceil(eq_chunk_size);
let cumul_ys = (0..num_chunks)
.into_par_iter()
.map(|i| {
let eq_start = i * eq_chunk_size;
let eq_end = (eq_start + eq_chunk_size).min(eq_guts.len());
let eq_chunk = &eq_guts[eq_start..eq_end];
let main_start = i * values_chunk_size * num_main_columns;
let main_end =
(main_start + values_chunk_size * num_main_columns).min(main_values.len());
let main_chunk = &main_values[main_start..main_end];
// Evaluate the constraint polynomial at the points 0, 2, and 4, and
// add the results to the y_0, y_2, and y_4 accumulators.
let mut cumul_y_0 = EF::zero();
Expand Down
74 changes: 31 additions & 43 deletions slop/crates/jagged/src/poly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,32 +200,22 @@ impl<F: AbstractField + 'static + Send + Sync> JaggedLittlePolynomialVerifierPar
// Iterate over all columns. For each column, we need to know the total length of all the
// columns up to the current one, this number - 1, and the
// number of rows in the current column.
let mut branching_program_evals = Vec::with_capacity(self.col_prefix_sums.len() - 1);
#[allow(clippy::uninit_vec)]
unsafe {
branching_program_evals.set_len(self.col_prefix_sums.len() - 1);
}
let next_col_prefix_sums = self.col_prefix_sums.iter().skip(1);
let res = self
.col_prefix_sums
.iter()
.zip(next_col_prefix_sums)
.zip(branching_program_evals.iter_mut())
.enumerate()
.par_bridge()
.map(|(col_num, ((prefix_sum, next_prefix_sum), branching_program_eval))| {
// For `z_col` on the Boolean hypercube, this is the delta function to pick out
// the right column for the current index.
let num_cols = self.col_prefix_sums.len() - 1;
let res = (0..num_cols)
.into_par_iter()
.map(|col_num| {
let prefix_sum = &self.col_prefix_sums[col_num];
let next_prefix_sum = &self.col_prefix_sums[col_num + 1];
let z_col_correction = z_col_partial_lagrange[col_num].clone();

let prefix_sum_ef =
prefix_sum.iter().map(|x| EF::from(x.clone())).collect::<Point<EF>>();
let next_prefix_sum_ef =
next_prefix_sum.iter().map(|x| EF::from(x.clone())).collect::<Point<EF>>();
*branching_program_eval =
let branching_program_eval =
branching_program.eval(&prefix_sum_ef, &next_prefix_sum_ef);

z_col_correction.clone() * branching_program_eval.clone()
z_col_correction * branching_program_eval
})
.sum::<EF>();

Expand Down Expand Up @@ -278,32 +268,30 @@ impl JaggedLittlePolynomialProverParams {

let result_chunk_size = max(total_area / num_cpus::get(), 1);
tracing::debug_span!("compute jagged values").in_scope(|| {
(result.chunks_mut(result_chunk_size).enumerate().par_bridge()).for_each(
|(chunk_idx, chunk)| {
let i = chunk_idx * result_chunk_size;
let mut col_range_iter = col_ranges.get_col_range(i).peekable();
let mut current_col_range = col_range_iter.next().unwrap();
let mut current_row = i - current_col_range.start_i;

chunk.iter_mut().for_each(|val| {
*val = if current_col_range.is_last {
K::zero()
} else {
col_eq.guts().as_slice()[current_col_range.col_idx]
* row_eq.guts().as_slice()[current_row]
};

current_row += 1;
while current_row == current_col_range.col_size {
if col_range_iter.peek().is_none() {
break;
}
current_col_range = col_range_iter.next().unwrap();
current_row = 0;
result.par_chunks_mut(result_chunk_size).enumerate().for_each(|(chunk_idx, chunk)| {
let i = chunk_idx * result_chunk_size;
let mut col_range_iter = col_ranges.get_col_range(i).peekable();
let mut current_col_range = col_range_iter.next().unwrap();
let mut current_row = i - current_col_range.start_i;

chunk.iter_mut().for_each(|val| {
*val = if current_col_range.is_last {
K::zero()
} else {
col_eq.guts().as_slice()[current_col_range.col_idx]
* row_eq.guts().as_slice()[current_row]
};

current_row += 1;
while current_row == current_col_range.col_size {
if col_range_iter.peek().is_none() {
break;
}
});
},
);
current_col_range = col_range_iter.next().unwrap();
current_row = 0;
}
});
});
});

result.into()
Expand Down
34 changes: 15 additions & 19 deletions slop/crates/multilinear/src/restrict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,29 +30,25 @@ where

let result_chunk_size =
max(num_non_zero_elements_out / num_cpus::get() * num_polynomials, num_polynomials);
let mle_chunk_size = 2 * result_chunk_size;
let mle_slice = mle.as_slice();

mle.as_slice()
.chunks(mle_chunk_size)
.zip(result.chunks_mut(result_chunk_size))
.par_bridge()
.for_each(|(mle_chunk, result_chunk)| {
let num_result_rows = result_chunk.len() / num_polynomials;
result.par_chunks_mut(result_chunk_size).enumerate().for_each(|(chunk_idx, result_chunk)| {
let mle_offset = chunk_idx * result_chunk_size * 2;
let num_result_rows = result_chunk.len() / num_polynomials;

(0..num_result_rows).for_each(|i| {
(0..num_polynomials).for_each(|j| {
let x = mle_chunk[(2 * i) * num_polynomials + j];
let y = mle_chunk
.get((2 * i + 1) * num_polynomials + j)
.copied()
.unwrap_or_else(|| padding_values[j]);
// return alpha * y + (EF::one() - alpha) * x, but in a more efficient
// way that minimizes extension field
// multiplications.
result_chunk[i * num_polynomials + j] = alpha * (y - x) + x;
});
(0..num_result_rows).for_each(|i| {
(0..num_polynomials).for_each(|j| {
let x = mle_slice[mle_offset + (2 * i) * num_polynomials + j];
let y = mle_slice
.get(mle_offset + (2 * i + 1) * num_polynomials + j)
.copied()
.unwrap_or_else(|| padding_values[j]);
// return alpha * y + (EF::one() - alpha) * x, but in a more efficient
// way that minimizes extension field multiplications.
result_chunk[i * num_polynomials + j] = alpha * (y - x) + x;
});
});
});

Tensor::from(result).reshape([num_non_zero_elements_out, num_polynomials])
}
Expand Down
Loading