Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
1f5cb52
mt selector loop
ledwards2225 Mar 30, 2026
532b846
mt compute paerm mapping
ledwards2225 Mar 30, 2026
32dfc62
PoC for multi-cursor approach w nontrivial tests
ledwards2225 Apr 1, 2026
e3a82b9
move thread orchestration to builder from tests
ledwards2225 Apr 2, 2026
2bb0433
move build_constraints_parallel to acir format, update tests
ledwards2225 Apr 2, 2026
7da5c25
attempting full pipeline reveals that threade circuits wont be identi…
ledwards2225 Apr 2, 2026
c46e8ec
prove sequential = N=1 = N=2
ledwards2225 Apr 3, 2026
7fa09a6
add cursor for ram/rom
ledwards2225 Apr 3, 2026
91639b3
different size range constraints have different predetermined stucuttre
ledwards2225 Apr 3, 2026
78ae39f
handle diff size big quad constraints
ledwards2225 Apr 3, 2026
81b11c6
handle logic and aes
ledwards2225 Apr 3, 2026
ce2ec0f
blake, poseidon, EC + deferred copy constraints + stronger semantic e…
ledwards2225 Apr 3, 2026
fa713b9
create tasks in sequential order to avoid lookup table id discrepenci…
ledwards2225 Apr 3, 2026
2911c0b
defer caching nnf ops to avoid race
ledwards2225 Apr 3, 2026
2b68b8c
clear out debugging code
ledwards2225 Apr 3, 2026
a622eb5
unify and clean up
ledwards2225 Apr 3, 2026
75de6aa
cleanup
ledwards2225 Apr 3, 2026
af5aa42
revert unrelated threading changes
ledwards2225 Apr 3, 2026
d4ee170
template over builder for Mega tests
ledwards2225 Apr 4, 2026
4889b79
seems to be working for Mega arithmetized Ultra verifier op codes
ledwards2225 Apr 5, 2026
839bb27
ultra shows discrepencies for some reason
ledwards2225 Apr 5, 2026
f64a72c
add PoC doc
ledwards2225 Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
404 changes: 404 additions & 0 deletions barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,22 @@ Builder create_circuit(AcirProgram& program, const ProgramMetadata& metadata = P
template <typename Builder>
void build_constraints(Builder& builder, AcirFormat& constraints, const ProgramMetadata& metadata);

/**
* @brief Parallel variant of build_constraints for UltraCircuitBuilder.
* @details Processes each constraint type's instances in parallel using execute_parallel.
* For each type with N instances: runs 1 warmup instance sequentially, measures per-instance
* block sizes, then processes remaining N-1 instances across num_threads threads.
* Produces a bit-identical circuit to build_constraints.
*
* @param builder Must be constructed with the witness and public inputs already set
* @param constraints The ACIR constraints to process
* @param metadata Program metadata
* @param num_threads Number of threads for parallel execution
*/
template <typename Builder>
void build_constraints_parallel(Builder& builder,
AcirFormat& constraints,
const ProgramMetadata& metadata,
size_t num_threads);

} // namespace acir_format

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,19 @@

namespace bb {

// Index of the calling thread within a parallel circuit-construction region. Selector,
// ExecutionTraceBlock and CircuitBuilderBase read it to choose which per-thread cursor to use.
// Every thread owns its own copy, which starts at 0.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
inline thread_local size_t parallel_thread_idx{ 0 };

/// @brief Record `idx` as the parallel index of the calling thread.
inline void set_parallel_thread_index(size_t idx) { parallel_thread_idx = idx; }

/// @brief Return the parallel index of the calling thread (0 if it was never set on this thread).
inline size_t get_parallel_thread_index() { return parallel_thread_idx; }

#ifdef CHECK_CIRCUIT_STACKTRACES
struct BbStackTrace : backward::StackTrace {
BbStackTrace() { load_here(32); }
Expand Down Expand Up @@ -119,6 +132,46 @@ template <typename FF> class Selector {
* @brief Release all memory held by this selector.
*/
virtual void free_memory() {}

/**
 * @brief Place the cursor of the given thread at `start`, which turns cursor mode on for that thread.
 * @details Used for parallel circuit construction where blocks are pre-allocated and each thread
 * writes at pre-determined offsets. The underlying storage must already be large enough for the
 * writes. The writing thread selects its cursor through set_parallel_thread_index() before
 * processing opcodes.
 *
 * @param thread_idx Slot in the per-thread cursor table; the table grows (new slots disabled) if needed.
 * @param start Position of the first write made through this cursor.
 */
void enable_cursor_mode(size_t thread_idx, size_t start)
{
    if (cursors_.size() <= thread_idx) {
        cursors_.resize(thread_idx + 1, CURSOR_DISABLED);
    }
    cursors_[thread_idx] = start;
}

// Legacy single-thread interface: operates on the cursor of thread index 0.
void enable_cursor_mode(size_t start)
{
    enable_cursor_mode(0, start);
}

/**
 * @brief Turn cursor mode off for the given thread.
 * @details A thread index without a slot in the cursor table is ignored; the table is never grown here.
 */
void disable_cursor_mode(size_t thread_idx)
{
    const bool has_slot = thread_idx < cursors_.size();
    if (has_slot) {
        cursors_[thread_idx] = CURSOR_DISABLED;
    }
}

// Legacy single-thread interface: operates on the cursor of thread index 0.
void disable_cursor_mode()
{
    disable_cursor_mode(0);
}

/// @brief True when the calling thread currently has an enabled cursor on this selector.
bool is_cursor_mode() const
{
    return CURSOR_DISABLED != active_cursor();
}

/**
 * @brief Cursor value of the calling thread.
 * @return The stored cursor, or CURSOR_DISABLED when the thread's index has no slot in the table.
 */
size_t active_cursor() const
{
    const size_t thread_idx = get_parallel_thread_index();
    if (thread_idx < cursors_.size()) {
        return cursors_[thread_idx];
    }
    return CURSOR_DISABLED;
}

/// @brief Mutable access to the calling thread's cursor. Not bounds-checked: the slot must already exist.
size_t& active_cursor_ref()
{
    const size_t thread_idx = get_parallel_thread_index();
    return cursors_[thread_idx];
}

static constexpr size_t CURSOR_DISABLED = std::numeric_limits<size_t>::max();

protected:
std::vector<size_t> cursors_; // per-thread cursors
};

/**
Expand All @@ -134,13 +187,21 @@ template <typename FF> class ZeroSelector : public Selector<FF> {
void emplace_back(int value) override
{
BB_ASSERT_EQ(value, 0, "Calling ZeroSelector::emplace_back with a non zero value.");
size_++;
if (this->is_cursor_mode()) {
this->active_cursor_ref()++;
} else {
size_++;
}
}

void push_back(const FF& value) override
{
BB_ASSERT(value.is_zero());
size_++;
if (this->is_cursor_mode()) {
this->active_cursor_ref()++;
} else {
size_++;
}
}

void set(size_t, int) override { BB_ASSERT(false, "ZeroSelector::set should not be called"); }
Expand Down Expand Up @@ -179,8 +240,22 @@ template <typename FF> class SlabVectorSelector : public Selector<FF> {
public:
using Selector<FF>::emplace_back;

void emplace_back(int i) override { data.emplace_back(i); }
void push_back(const FF& value) override { data.push_back(value); }
/**
 * @brief Store `i` as the next selector value.
 * @details Outside cursor mode the value is appended. In cursor mode it overwrites the element at
 * the calling thread's cursor, which is then advanced by one; the storage is not grown.
 */
void emplace_back(int i) override
{
    if (!this->is_cursor_mode()) {
        data.emplace_back(i);
        return;
    }
    size_t& cursor = this->active_cursor_ref();
    data[cursor] = i;
    ++cursor;
}
/**
 * @brief Store `value` as the next selector value.
 * @details Outside cursor mode the value is appended. In cursor mode it overwrites the element at
 * the calling thread's cursor, which is then advanced by one; the storage is not grown.
 */
void push_back(const FF& value) override
{
    if (!this->is_cursor_mode()) {
        data.push_back(value);
        return;
    }
    size_t& cursor = this->active_cursor_ref();
    data[cursor] = value;
    ++cursor;
}
void set(size_t idx, int i) override { data[idx] = i; }
void set(size_t idx, const FF& value) override { data[idx] = value; }
void resize(size_t new_size) override { data.resize(new_size); }
Expand Down Expand Up @@ -246,6 +321,14 @@ template <typename FF, size_t NUM_WIRES_> class ExecutionTraceBlock {
size_t cached_size_ = 0; // set by free_data() so size() works after freeing
bool data_freed_ = false; // true after free_data() has been called
uint32_t trace_offset_ = std::numeric_limits<uint32_t>::max(); // where this block starts in the trace
std::vector<size_t> wire_cursors_; // per-thread wire cursors

/**
 * @brief Wire cursor of the calling thread.
 * @return The stored cursor, or Selector<FF>::CURSOR_DISABLED when the thread's index has no slot
 * in wire_cursors_.
 */
size_t wire_active_cursor() const
{
    const size_t thread_idx = get_parallel_thread_index();
    if (thread_idx < wire_cursors_.size()) {
        return wire_cursors_[thread_idx];
    }
    return Selector<FF>::CURSOR_DISABLED;
}

uint32_t trace_offset() const
{
Expand All @@ -257,6 +340,65 @@ template <typename FF, size_t NUM_WIRES_> class ExecutionTraceBlock {

/// @brief Length of the first wire column, or the cached size once the block's data has been freed.
size_t size() const
{
    if (data_freed_) {
        return cached_size_;
    }
    return std::get<0>(this->wires).size();
}

/**
 * @brief Index of the gate most recently written through populate_wires.
 * @details With an enabled wire cursor, populate_wires writes at the cursor and then advances it,
 * so the last gate sits one position before the cursor. Otherwise the last gate is at size() - 1.
 * Call this right after populate_wires, before anything else writes to this block.
 */
size_t last_gate_index() const
{
    const size_t cursor = wire_active_cursor();
    const size_t one_past_last = (cursor == Selector<FF>::CURSOR_DISABLED) ? size() : cursor;
    return one_past_last - 1;
}

/**
 * @brief Index at which the next gate will be written.
 * @details This is the calling thread's wire cursor when one is enabled, and size() otherwise.
 */
size_t next_gate_index() const
{
    const size_t cursor = wire_active_cursor();
    return (cursor == Selector<FF>::CURSOR_DISABLED) ? size() : cursor;
}

/**
* @brief Enable cursor mode for a thread: subsequent gate writes go to position `start` and advance.
* @details The block's wires and selectors must already be sized to accommodate the writes.
* Used for parallel circuit construction where threads write to pre-allocated regions.
*/
void enable_cursor_mode(size_t thread_idx, size_t start)
{
if (thread_idx >= wire_cursors_.size()) {
wire_cursors_.resize(thread_idx + 1, Selector<FF>::CURSOR_DISABLED);
}
wire_cursors_[thread_idx] = start;
for (auto& sel : get_selectors()) {
sel.enable_cursor_mode(thread_idx, start);
}
}

// Legacy single-thread interface
void enable_cursor_mode(size_t start) { enable_cursor_mode(0, start); }

void disable_cursor_mode(size_t thread_idx)
{
if (thread_idx < wire_cursors_.size()) {
wire_cursors_[thread_idx] = Selector<FF>::CURSOR_DISABLED;
}
for (auto& sel : get_selectors()) {
sel.disable_cursor_mode(thread_idx);
}
}

void disable_cursor_mode() { disable_cursor_mode(0); }

#ifdef TRACY_HACK_GATES_AS_MEMORY
~ExecutionTraceBlock()
{
Expand Down Expand Up @@ -295,10 +437,19 @@ template <typename FF, size_t NUM_WIRES_> class ExecutionTraceBlock {
this->stack_traces.populate();
#endif
this->tracy_gate();
this->wires[0].emplace_back(idx_1);
this->wires[1].emplace_back(idx_2);
this->wires[2].emplace_back(idx_3);
this->wires[3].emplace_back(idx_4);
size_t wc = wire_active_cursor();
if (wc != Selector<FF>::CURSOR_DISABLED) {
this->wires[0][wc] = idx_1;
this->wires[1][wc] = idx_2;
this->wires[2][wc] = idx_3;
this->wires[3][wc] = idx_4;
wire_cursors_[get_parallel_thread_index()]++;
} else {
this->wires[0].emplace_back(idx_1);
this->wires[1].emplace_back(idx_2);
this->wires[2].emplace_back(idx_3);
this->wires[3].emplace_back(idx_4);
}
}

auto& w_l() { return std::get<0>(this->wires); };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "barretenberg/ecc/curves/bn254/bn254.hpp"
#include "barretenberg/ecc/curves/bn254/fr.hpp"
#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"
#include "barretenberg/honk/execution_trace/execution_trace_block.hpp" // for get_parallel_thread_index
#include "barretenberg/honk/execution_trace/gate_data.hpp"
#include "barretenberg/public_input_component/public_component_key.hpp"
#include "barretenberg/serialize/msgpack.hpp"
Expand Down Expand Up @@ -52,6 +53,44 @@ template <typename FF_> class CircuitBuilderBase {

size_t _num_gates = 0;

public:
// Cursor for parallel variable allocation. When enabled, add_variable writes at cursor position
// instead of appending. The variable vectors must be pre-sized to accommodate the writes.
static constexpr uint32_t VARIABLE_CURSOR_DISABLED = UINT32_MAX;

// Deferred assert_equal entries for parallel construction. In cursor mode, assert_equal calls
// are recorded per-task and replayed in deterministic task order after all threads join.
// This prevents nondeterministic union-find results when multiple threads assert_equal on
// the same shared ACIR witness.
// One entry holds the three arguments of a single deferred assert_equal call. Field order matters:
// entries are built by brace-initialisation in (a_variable_idx, b_variable_idx, msg) order.
struct DeferredAssertEqual {
// Index of the first variable passed to the deferred assert_equal call.
uint32_t a_variable_idx;
// Index of the second variable passed to the deferred assert_equal call.
uint32_t b_variable_idx;
// Failure message passed to the deferred assert_equal call.
std::string msg;
// Member-wise equality over the three fields above.
bool operator==(const DeferredAssertEqual&) const = default;
};
std::vector<std::vector<DeferredAssertEqual>> deferred_assert_equals_; // per-task

/**
 * @brief Size the deferred assert_equal storage to one buffer per task.
 * @details Uses resize, so buffers that already exist below `num_tasks` keep their contents.
 * @param num_tasks Number of per-task buffers to make available.
 */
void init_deferred_assert_equal_buffers(size_t num_tasks)
{
    deferred_assert_equals_.resize(num_tasks);
}

// Tell the builder which task the calling thread is executing; deferred assert_equal entries are
// filed under this index. The value is stored per thread, so concurrent threads cannot overwrite
// each other's task index. It is a static member, so it is not tied to one builder instance.
void set_current_task_index(size_t task_idx)
{
    current_task_idx_ = task_idx;
}
static inline thread_local size_t current_task_idx_{ 0 };

void apply_deferred_assert_equals()
{
// Replay in task order (0, 1, 2, ...) for deterministic union-find results
for (auto& task_buf : deferred_assert_equals_) {
for (auto& entry : task_buf) {
assert_equal(entry.a_variable_idx, entry.b_variable_idx, entry.msg);
}
task_buf.clear();
}
}

private:
std::vector<uint32_t> variable_cursors_; // per-thread variable cursors

/**
* @brief Update all variables from index in equivalence class to have real variable new_real_index
* @param index The index of a variable in the class we're updating
Expand Down Expand Up @@ -144,6 +183,10 @@ template <typename FF_> class CircuitBuilderBase {
/**
 * @brief Add `count` to the running gate total.
 * @details Asserts that the circuit is not finalized. When the calling thread has an enabled
 * variable cursor the total is left untouched: in that mode the gate count is pre-computed, and
 * skipping the update avoids races during parallel construction.
 * @param count Number of gates to add (defaults to 1).
 */
void increment_num_gates(size_t count = 1)
{
    BB_ASSERT(!circuit_finalized, "Cannot add gates after circuit is finalized");
    const bool cursor_active = get_variable_cursor() != VARIABLE_CURSOR_DISABLED;
    if (!cursor_active) {
        _num_gates += count;
    }
}

Expand Down Expand Up @@ -188,6 +231,8 @@ template <typename FF_> class CircuitBuilderBase {
}

/// @brief Read-only view of the public input list.
const std::vector<uint32_t>& public_inputs() const
{
    return _public_inputs;
}

/// @brief Read-only view of next_var_index (one entry per variable).
const std::vector<uint32_t>& get_next_var_index() const
{
    return next_var_index;
}

/// @brief Read-only view of prev_var_index (one entry per variable).
const std::vector<uint32_t>& get_prev_var_index() const
{
    return prev_var_index;
}

/**
* @brief Set the _public_inputs_finalized to true to prevent any new public inputs from being added
Expand All @@ -211,6 +256,50 @@ template <typename FF_> class CircuitBuilderBase {
*/
virtual uint32_t add_variable(const FF& in);

/**
 * @brief Turn on variable cursor mode for a thread during parallel construction.
 * @details While enabled, add_variable called on that thread writes at the cursor position
 * instead of appending. The variables, real_variable_index, next_var_index, prev_var_index and
 * real_variable_tags vectors must already be sized to hold those writes.
 *
 * @param thread_idx Slot in the per-thread cursor table; the table grows (new slots disabled) if needed.
 * @param start Variable index at which this thread's first add_variable writes.
 */
void enable_variable_cursor(size_t thread_idx, uint32_t start)
{
    if (variable_cursors_.size() <= thread_idx) {
        variable_cursors_.resize(thread_idx + 1, VARIABLE_CURSOR_DISABLED);
    }
    variable_cursors_[thread_idx] = start;
}

// Legacy single-thread interface: operates on thread index 0.
void enable_variable_cursor(uint32_t start)
{
    enable_variable_cursor(0, start);
}

/**
 * @brief Turn off variable cursor mode for a thread.
 * @details A thread index without a slot in the cursor table is ignored; the table is never grown here.
 */
void disable_variable_cursor(size_t thread_idx)
{
    const bool has_slot = thread_idx < variable_cursors_.size();
    if (has_slot) {
        variable_cursors_[thread_idx] = VARIABLE_CURSOR_DISABLED;
    }
}

// Legacy single-thread interface: operates on thread index 0.
void disable_variable_cursor()
{
    disable_variable_cursor(0);
}

/**
 * @brief Variable cursor of the calling thread.
 * @return The stored cursor, or VARIABLE_CURSOR_DISABLED when the thread's index has no slot in
 * the cursor table.
 */
uint32_t get_variable_cursor() const
{
    const size_t thread_idx = get_parallel_thread_index();
    if (thread_idx < variable_cursors_.size()) {
        return variable_cursors_[thread_idx];
    }
    return VARIABLE_CURSOR_DISABLED;
}

/**
 * @brief Pre-allocate variable storage for parallel construction.
 * @details Resizes the five per-variable vectors to the same length so that cursor-mode
 * add_variable can write into them by index.
 * @param total_size The total number of variables (existing + new from all threads).
 */
void resize_variables(size_t total_size)
{
    const auto resize_to_total = [total_size](auto& storage) { storage.resize(total_size); };
    resize_to_total(variables);
    resize_to_total(real_variable_index);
    resize_to_total(next_var_index);
    resize_to_total(prev_var_index);
    resize_to_total(real_variable_tags);
}

// Disallow add_variable for non-FF types to prevent implicit conversions (specifically, using indices rather
// than values)
template <typename OT> uint32_t add_variable(const OT& in) = delete;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ void CircuitBuilderBase<FF_>::update_real_variable_indices(uint32_t index, uint3

template <typename FF_> uint32_t CircuitBuilderBase<FF_>::add_variable(const FF& in)
{
uint32_t cursor = get_variable_cursor();
if (cursor != VARIABLE_CURSOR_DISABLED) {
auto thread_idx = get_parallel_thread_index();
const uint32_t index = variable_cursors_[thread_idx]++;
variables[index] = in;
real_variable_index[index] = index;
next_var_index[index] = REAL_VARIABLE;
prev_var_index[index] = FIRST_VARIABLE_IN_CLASS;
real_variable_tags[index] = DEFAULT_TAG;
return index;
}
variables.emplace_back(in);
const uint32_t index = static_cast<uint32_t>(variables.size()) - 1U;
real_variable_index.emplace_back(index);
Expand Down Expand Up @@ -114,6 +125,13 @@ void CircuitBuilderBase<FF>::assert_equal(const uint32_t a_variable_idx,
const uint32_t b_variable_idx,
std::string const& msg)
{
// In cursor mode, defer assert_equal to avoid nondeterministic union-find results
// when multiple threads modify chains rooted at the same shared witness.
// Deferred entries are stored per-task and replayed in task order after all threads join.
if (get_variable_cursor() != VARIABLE_CURSOR_DISABLED) {
deferred_assert_equals_[current_task_idx_].push_back({ a_variable_idx, b_variable_idx, msg });
return;
}
assert_valid_variables({ a_variable_idx, b_variable_idx });
bool values_equal = (get_variable(a_variable_idx) == get_variable(b_variable_idx));
if (!values_equal && !failed()) {
Expand Down
Loading
Loading