diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp index 15ec2cf72963..8bf61a0f2bfb 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp @@ -251,4 +251,408 @@ template <> MegaCircuitBuilder create_circuit(AcirProgram& program, const Progra template void build_constraints(UltraCircuitBuilder&, AcirFormat&, const ProgramMetadata&); template void build_constraints(MegaCircuitBuilder&, AcirFormat&, const ProgramMetadata&); +/** + * @brief Profile data for a constraint type, extracted from a throwaway builder. + * @details Eventually this will be a compile-time table lookup. For now, it's computed + * by running constraints on a throwaway builder and extracting the resulting state. + */ +template struct ConstraintProfile { + typename Builder::TaskBlockSizes block_sizes; + std::vector constants; // constant values to pre-register + std::vector range_list_targets; // range list target ranges to pre-create + std::vector table_ids; // lookup tables to pre-create + size_t num_rom_arrays_per_instance = 0; // ROM arrays created per constraint instance + size_t num_ram_arrays_per_instance = 0; // RAM arrays created per constraint instance + std::vector rom_array_sizes; // sizes of ROM arrays created per instance + std::vector ram_array_sizes; // sizes of RAM arrays created per instance +}; + +/** + * @brief Profile a constraint type by running it on a throwaway builder and extracting cache state. + * @details Runs two instances: the first triggers one-time setup, the second measures steady-state cost. + * Extracts all constants, range list targets, and lookup table IDs that the constraint type needs. + * This simulates the eventual table lookup. 
+ */ +template +ConstraintProfile profile_constraint_type(ConstraintType representative, + Handler&& handler, + size_t num_witnesses) +{ + ConstraintProfile profile; + + // Phase A: Run one instance on a throwaway builder to discover setup needs (constants, range lists, etc.) + WitnessVector dummy_witness(num_witnesses, bb::fr(0)); + // Construct throwaway builder — Mega needs a default op_queue, Ultra uses the 3-arg constructor + auto make_builder = [&]() -> Builder { + if constexpr (std::is_same_v) { + return Builder(dummy_witness, {}, /*is_write_vk_mode=*/true); + } else { + return Builder(std::make_shared(), dummy_witness, {}, /*is_write_vk_mode=*/true); + } + }; + + Builder warmup_builder = make_builder(); + handler(warmup_builder, representative); + + // Extract setup data from the warmup builder + for (const auto& [value, _] : warmup_builder.constant_variable_indices) { + profile.constants.push_back(value); + } + for (const auto& [target_range, _] : warmup_builder.range_lists) { + profile.range_list_targets.push_back(target_range); + } + for (const auto& table : warmup_builder.get_lookup_tables()) { + profile.table_ids.push_back(table.id); + } + + // Phase B: Measure steady-state cost on a SEPARATE builder pre-populated with setup data. + // This ensures no cross-instance gate fusion at the boundary, matching cursor-mode behavior + // where each task starts with no prior gates in its block region. 
+ Builder measure_builder = make_builder(); + for (const auto& value : profile.constants) { + measure_builder.put_constant_variable(value); + } + for (const auto target_range : profile.range_list_targets) { + if (measure_builder.range_lists.count(target_range) == 0) { + measure_builder.range_lists.insert({ target_range, measure_builder.create_range_list(target_range) }); + } + } + for (const auto table_id : profile.table_ids) { + measure_builder.get_table(table_id); + } + + auto before = measure_builder.snapshot_block_sizes(); + size_t rom_before = measure_builder.rom_ram_logic.rom_arrays.size(); + size_t ram_before = measure_builder.rom_ram_logic.ram_arrays.size(); + handler(measure_builder, representative); + auto after = measure_builder.snapshot_block_sizes(); + profile.block_sizes = Builder::delta(before, after); + + // Extract ROM/RAM array counts per instance + profile.num_rom_arrays_per_instance = measure_builder.rom_ram_logic.rom_arrays.size() - rom_before; + profile.num_ram_arrays_per_instance = measure_builder.rom_ram_logic.ram_arrays.size() - ram_before; + for (size_t i = rom_before; i < measure_builder.rom_ram_logic.rom_arrays.size(); i++) { + profile.rom_array_sizes.push_back(measure_builder.rom_ram_logic.rom_arrays[i].state.size()); + } + for (size_t i = ram_before; i < measure_builder.rom_ram_logic.ram_arrays.size(); i++) { + profile.ram_array_sizes.push_back(measure_builder.rom_ram_logic.ram_arrays[i].state.size()); + } + + return profile; +} + +/** + * @brief Prepare a builder's caches from constraint profiles WITHOUT running any constraints. + * @details Populates the builder's constant cache, range lists, and lookup tables using data + * extracted from profiles. After this, all parallel constraint execution will find everything + * cached — no cache misses, no one-time setup costs. 
+ */ +template +void prepare_builder_from_profiles(Builder& builder, const std::vector>& profiles) +{ + // Register all constants from all profiles + for (const auto& profile : profiles) { + for (const auto& value : profile.constants) { + builder.put_constant_variable(value); + } + } + + // Create all needed range lists + for (const auto& profile : profiles) { + for (const auto target_range : profile.range_list_targets) { + if (builder.range_lists.count(target_range) == 0) { + builder.range_lists.insert({ target_range, builder.create_range_list(target_range) }); + } + } + } + + // Note: lookup tables are NOT created here. They are created in task order in Phase 2b + // so that table indices match sequential constraint processing order. +} + +template +void build_constraints_parallel(Builder& builder, + AcirFormat& constraints, + const ProgramMetadata& metadata, + size_t num_threads) +{ + using TaskBlockSizes = typename Builder::TaskBlockSizes; + size_t num_witnesses = constraints.max_witness_index + 1; + + // Phase 1: Profile each constraint type to build a map from grouping key to profile. + // Each constraint type has a key function that determines which instances share the same + // gate count profile. We profile one representative per unique key. + // + // Phase 1b: Collect tasks in the SAME ORDER as sequential build_constraints processes them. + // This ensures that lookup tables, ROM arrays, and other ordering-dependent state are created + // in an order that matches sequential, making the circuits identical up to gate reordering. + + // Use the UltraCircuitBuilder_ base type for task functions since execute_parallel is defined there. + // MegaCircuitBuilder inherits from UltraCircuitBuilder_, so this works for both. + using BaseBuilder = UltraCircuitBuilder_; + + std::vector> profiles; + std::vector> tasks; + std::vector task_sizes; + std::vector task_profile_indices; + + // Helper: profile unique keys in a constraint vector, then add tasks in vector order. 
+ // Combines profiling and task collection in a single call per constraint type. + auto profile_and_collect = [&](auto& items, auto handler, auto key_fn) { + if (items.empty()) { + return; + } + using Key = decltype(key_fn(items[0])); + std::map key_to_profile; + // Phase 1: profile unique keys + for (size_t i = 0; i < items.size(); i++) { + Key k = key_fn(items[i]); + if (key_to_profile.count(k) == 0) { + auto profile = profile_constraint_type(items[i], handler, num_witnesses); + key_to_profile[k] = profiles.size(); + profiles.push_back(profile); + } + } + // Phase 1b: add tasks in vector order + for (size_t i = 0; i < items.size(); i++) { + size_t profile_idx = key_to_profile.at(key_fn(items[i])); + const auto& profile = profiles[profile_idx]; + auto sizes = profile.block_sizes; + sizes.num_rom_arrays = profile.num_rom_arrays_per_instance; + sizes.num_ram_arrays = profile.num_ram_arrays_per_instance; + tasks.emplace_back([handler, &items, i](BaseBuilder& b) { handler(static_cast(b), items[i]); }); + task_sizes.push_back(sizes); + task_profile_indices.push_back(profile_idx); + } + }; + + // For constraint types with no grouping (fixed gate count), the key is a constant. 
+ auto const_key = [](const auto&) -> int { return 0; }; + + // Define key functions for each grouped type + auto big_quad_key = [](const BigQuadConstraint& c) -> size_t { return c.size(); }; + auto logic_key = [](const LogicConstraint& c) -> std::pair { + return { c.num_bits, c.is_xor_gate }; + }; + auto range_key = [](const RangeConstraint& c) -> uint32_t { return c.num_bits; }; + auto aes_key = [](const AES128Constraint& c) -> size_t { return c.inputs.size(); }; + auto blake2s_key = [](const Blake2sConstraint& c) -> size_t { return c.inputs.size(); }; + auto blake3_key = [](const Blake3Constraint& c) -> size_t { return c.inputs.size(); }; + auto pos2_key = [](const Poseidon2Constraint& c) -> size_t { return c.state.size(); }; + auto msm_key = [](const MultiScalarMul& c) -> std::vector { + std::vector key; + key.reserve(c.points.size() + c.scalars.size()); + for (const auto& p : c.points) + key.push_back(p.is_constant); + for (const auto& s : c.scalars) + key.push_back(s.is_constant); + return key; + }; + + // Define handlers + auto quad_handler = [](Builder& b, QuadConstraint& c) { create_quad_constraint(b, c); }; + auto big_quad_handler = [](Builder& b, BigQuadConstraint& c) { create_big_quad_constraint(b, c); }; + auto logic_handler = [](Builder& b, const LogicConstraint& c) { + create_logic_gate(b, c.a, c.b, c.result, c.num_bits, c.is_xor_gate); + }; + auto range_handler = [](Builder& b, const RangeConstraint& c) { + b.create_dyadic_range_constraint(c.witness, c.num_bits, "parallel range constraint"); + }; + auto aes_handler = [](Builder& b, const AES128Constraint& c) { create_aes128_constraints(b, c); }; + auto sha_handler = [](Builder& b, const Sha256Compression& c) { create_sha256_compression_constraints(b, c); }; + auto ecdsa_k1_handler = [](Builder& b, const EcdsaConstraint& c) { + create_ecdsa_verify_constraints>(b, c); + }; + auto ecdsa_r1_handler = [](Builder& b, const EcdsaConstraint& c) { + create_ecdsa_verify_constraints>(b, c); + }; + auto 
blake2s_handler = [](Builder& b, const Blake2sConstraint& c) { create_blake2s_constraints(b, c); }; + auto blake3_handler = [](Builder& b, const Blake3Constraint& c) { create_blake3_constraints(b, c); }; + auto keccak_handler = [](Builder& b, const Keccakf1600& c) { create_keccak_permutations_constraints(b, c); }; + auto pos2_handler = [](Builder& b, const Poseidon2Constraint& c) { + create_poseidon2_permutations_constraints(b, c); + }; + auto msm_handler = [](Builder& b, const MultiScalarMul& c) { create_multi_scalar_mul_constraint(b, c); }; + auto ec_add_handler = [](Builder& b, const EcAdd& c) { create_ec_add_constraint(b, c); }; + + // Profile and collect tasks in the same order as sequential build_constraints. + // Each call profiles unique keys, then adds tasks in constraint vector order. + profile_and_collect(constraints.quad_constraints, quad_handler, const_key); + profile_and_collect(constraints.big_quad_constraints, big_quad_handler, big_quad_key); + profile_and_collect(constraints.logic_constraints, logic_handler, logic_key); + profile_and_collect(constraints.range_constraints, range_handler, range_key); + profile_and_collect(constraints.aes128_constraints, aes_handler, aes_key); + profile_and_collect(constraints.sha256_compression, sha_handler, const_key); + profile_and_collect(constraints.ecdsa_k1_constraints, ecdsa_k1_handler, const_key); + profile_and_collect(constraints.ecdsa_r1_constraints, ecdsa_r1_handler, const_key); + profile_and_collect(constraints.blake2s_constraints, blake2s_handler, blake2s_key); + profile_and_collect(constraints.blake3_constraints, blake3_handler, blake3_key); + profile_and_collect(constraints.keccak_permutations, keccak_handler, const_key); + profile_and_collect(constraints.poseidon2_constraints, pos2_handler, pos2_key); + profile_and_collect(constraints.multi_scalar_mul_constraints, msm_handler, msm_key); + profile_and_collect(constraints.ec_add_constraints, ec_add_handler, const_key); + + // Recursion constraints are 
parallelized like other constraint types, but each task also + // captures a HonkRecursionConstraintOutput for post-join merging (needed for pairing point + // propagation and IPA finalization). + struct RecursionTaskInfo { + HonkRecursionConstraintOutput output; + bool update_ipa_data = false; + bool is_root_rollup = false; + }; + size_t num_rec_tasks = constraints.honk_recursion_constraints.size() + + constraints.chonk_recursion_constraints.size() + + constraints.avm_recursion_constraints.size(); + std::vector recursion_task_outputs(num_rec_tasks); + size_t rec_out_idx = 0; + + // Helper: execute a single honk recursion constraint based on proof_type + auto execute_honk_recursion = [](Builder& b, + const RecursionConstraint& c) -> HonkRecursionConstraintOutput { + if (c.proof_type == HONK_ZK) { + return create_honk_recursion_constraints, + stdlib::recursion::honk::DefaultIO>(b, c); + } else if (c.proof_type == HONK) { + return create_honk_recursion_constraints, + stdlib::recursion::honk::DefaultIO>(b, c); + } else { + // Rollup IO is only supported on UltraCircuitBuilder + if constexpr (std::is_same_v) { + return create_honk_recursion_constraints, + stdlib::recursion::honk::RollupIO>(b, c); + } else { + bb::assert_failure("Rollup Honk proof type not supported on MegaBuilder"); + return {}; + } + } + }; + + // Profiling handler (discards output — only used for measuring gate counts) + auto honk_rec_handler = [&execute_honk_recursion](Builder& b, const RecursionConstraint& c) { + execute_honk_recursion(b, c); + }; + auto honk_rec_key = [](const RecursionConstraint& c) -> uint32_t { return c.proof_type; }; + + // Profile honk recursion constraints by proof_type + std::map honk_rec_profiles; + for (size_t i = 0; i < constraints.honk_recursion_constraints.size(); i++) { + uint32_t k = honk_rec_key(constraints.honk_recursion_constraints[i]); + if (honk_rec_profiles.count(k) == 0) { + auto profile = profile_constraint_type( + constraints.honk_recursion_constraints[i], 
honk_rec_handler, num_witnesses); + honk_rec_profiles[k] = profiles.size(); + profiles.push_back(profile); + } + } + // Add honk recursion tasks in vector order with output capture + for (size_t i = 0; i < constraints.honk_recursion_constraints.size(); i++) { + const auto& c = constraints.honk_recursion_constraints[i]; + size_t profile_idx = honk_rec_profiles.at(c.proof_type); + const auto& profile = profiles[profile_idx]; + auto sizes = profile.block_sizes; + sizes.num_rom_arrays = profile.num_rom_arrays_per_instance; + sizes.num_ram_arrays = profile.num_ram_arrays_per_instance; + + size_t out_idx = rec_out_idx++; + recursion_task_outputs[out_idx].update_ipa_data = + (c.proof_type == ROLLUP_HONK || c.proof_type == ROOT_ROLLUP_HONK); + recursion_task_outputs[out_idx].is_root_rollup = (c.proof_type == ROOT_ROLLUP_HONK); + + tasks.emplace_back( + [&constraints, i, &execute_honk_recursion, &recursion_task_outputs, out_idx](BaseBuilder& b) { + recursion_task_outputs[out_idx].output = + execute_honk_recursion(static_cast(b), constraints.honk_recursion_constraints[i]); + }); + task_sizes.push_back(sizes); + task_profile_indices.push_back(profile_idx); + } + + // TODO: Chonk and AVM recursion constraints — same pattern as honk above. + // For now they fall through to Phase 4 sequential processing if present. + + // Phase 2: Prepare the builder's caches from profiles (no constraint execution). + prepare_builder_from_profiles(builder, profiles); + + // Phase 2b: Pre-create lookup tables and ROM/RAM arrays in task order (matching sequential + // constraint processing order). This ensures table indices and ROM IDs are deterministic + // and match what sequential build_constraints would produce. 
+ for (size_t t = 0; t < tasks.size(); t++) { + const auto& profile = profiles[task_profile_indices[t]]; + for (const auto table_id : profile.table_ids) { + builder.get_table(table_id); // no-op if already created + } + for (size_t r = 0; r < profile.num_rom_arrays_per_instance; r++) { + builder.rom_ram_logic.create_ROM_array(profile.rom_array_sizes[r]); + } + for (size_t r = 0; r < profile.num_ram_arrays_per_instance; r++) { + builder.rom_ram_logic.create_RAM_array(profile.ram_array_sizes[r]); + } + } + + // Phase 3: Execute ALL instances in parallel (including recursion constraints) + if (!tasks.empty()) { + builder.execute_parallel(tasks, task_sizes, num_threads); + } + + // Phase 4: Block constraints (sequential — these reference variables from earlier constraints). + for (const auto& [constraint, opcode_indices] : + zip_view(constraints.block_constraints, constraints.original_opcode_indices.block_constraints)) { + create_block_constraints(builder, constraint); + } + + // Phase 4b: Merge recursion outputs from parallel tasks and process remaining sequential recursion. 
+ { + HonkRecursionConstraintsOutput output; + + // Merge outputs from honk recursion tasks that ran in Phase 3 + for (size_t i = 0; i < rec_out_idx; i++) { + const auto& rec = recursion_task_outputs[i]; + output.update(rec.output, rec.update_ipa_data); + if (rec.is_root_rollup) { + output.is_root_rollup = true; + } + } + + // Chonk and AVM recursion constraints — Ultra only, sequential for now (TODO: parallelize) + if constexpr (std::is_same_v) { + for (const auto& constraint : constraints.chonk_recursion_constraints) { + auto honk_output = create_chonk_recursion_constraints(builder, constraint); + output.update(honk_output, /*update_ipa_data=*/true); + } + for (const auto& constraint : constraints.avm_recursion_constraints) { + auto honk_output = create_avm2_recursion_constraints_goblin(builder, constraint); + output.update(honk_output, /*update_ipa_data=*/true); + } + } + + // HyperNova recursion constraints (Mega only, requires IVC state — always sequential) + const bool is_hn_recursion_constraints = !constraints.hn_recursion_constraints.empty(); + if (is_hn_recursion_constraints) { + GateCounter gate_counter{ &builder, false }; + std::vector dummy_gates_per_opcode; + auto hn_output = create_recursion_constraints( + builder, + gate_counter, + dummy_gates_per_opcode, + metadata.ivc, + { {}, {} }, + { {}, {} }, + { constraints.hn_recursion_constraints, constraints.original_opcode_indices.hn_recursion_constraints }, + { {}, {} }); + output.update(hn_output, /*update_ipa_data=*/false); + } + + output.finalize(builder, is_hn_recursion_constraints, metadata.has_ipa_claim); + } +} + +template void build_constraints_parallel(UltraCircuitBuilder&, + AcirFormat&, + const ProgramMetadata&, + size_t); +template void build_constraints_parallel(MegaCircuitBuilder&, + AcirFormat&, + const ProgramMetadata&, + size_t); + } // namespace acir_format diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp 
b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp index b00b86881573..2c36c61f780f 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp @@ -162,4 +162,22 @@ Builder create_circuit(AcirProgram& program, const ProgramMetadata& metadata = P template void build_constraints(Builder& builder, AcirFormat& constraints, const ProgramMetadata& metadata); +/** + * @brief Parallel variant of build_constraints (instantiated for both Ultra and Mega builders). + * @details Profiles one representative per constraint type/grouping key on a throwaway builder, + * pre-creates all setup state (constants, range lists, lookup tables, ROM/RAM arrays), then + * executes all constraint instances in parallel via execute_parallel. Produces bit-identical + * circuits for any thread count (N=1 matches N=32); note the circuit (and hence the VK) differs + * from sequential build_constraints because setup gates are hoisted to the front. + * + * @param builder Must be constructed with the witness and public inputs already set + * @param constraints The ACIR constraints to process + * @param metadata Program metadata + * @param num_threads Number of threads for parallel execution + */ +template +void build_constraints_parallel(Builder& builder, + AcirFormat& constraints, + const ProgramMetadata& metadata, + size_t num_threads); + } // namespace acir_format diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/parallel_circuit_construction_poc.md b/barretenberg/cpp/src/barretenberg/dsl/acir_format/parallel_circuit_construction_poc.md new file mode 100644 index 000000000000..25df3bed0455 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/parallel_circuit_construction_poc.md @@ -0,0 +1,372 @@ +# Parallel ACIR Circuit Construction + +## Status: PoC complete, ready for production PR + +The core parallel execution mechanism is proven and tested. The originally-planned precursor +refactor (separating setup gates from constraint gates in sequential `build_constraints`) turned +out to be unnecessary — see "Key insight" below. 
+ +## Key insight: no precursor PR needed + +The original plan called for a two-PR approach: first refactor sequential `build_constraints` to +pre-create setup state, then add the parallel path. The concern was that sequential and parallel +paths would produce different circuits because setup gates land in different positions. + +**The realization:** `build_constraints_parallel` with N=1 threads already produces bit-identical +circuits to N=2 threads. Both paths go through `prepare_builder_from_profiles` (which pre-creates +constants, range lists, and lookup tables), so setup gates land in the same position regardless of +thread count. Sequential execution is just the N=1 special case of parallel execution. + +**Validated:** `BuildConstraintsParallelN1vsN2` test passes — full wire-by-wire, selector-by-selector, +variable, and union-find comparison between 1-thread and 2-thread parallel construction. Zero +mismatches. + +This means we can ship the parallel infrastructure in a single PR: +1. Wire `build_constraints_parallel` into `create_circuit` for `UltraCircuitBuilder` +2. The old sequential `build_constraints` remains for other builder types (Mega) +3. Update VKs (they change because setup gates move to the beginning) +4. All existing tests pass — the parallel path with any thread count is a drop-in replacement + +**Note on scope:** The builder changes (cursors, deferred buffers, `execute_parallel`) only affect +the ACIR construction path. Direct C++ circuit construction is unchanged — cursor mode is opt-in, +entered only through `execute_parallel`. The lazy-init behavior of `put_constant_variable`, +`create_range_list`, and `get_table` is preserved for all non-ACIR usage. + +## CRITICAL INVARIANT: Bit-identical circuits for any N + +**Circuits produced by `build_constraints_parallel` MUST be bit-identical regardless of thread +count.** N=1, N=2, N=32 must all produce the exact same circuit — same wires, same selectors, +same variable indices, same union-find. 
This is non-negotiable because: + +- Different circuits produce different VKs +- VKs are hardcoded in the protocol (Aztec L1 contracts) +- If the circuit depends on thread count, different machines with different core counts would + produce incompatible proofs +- The verifier must be able to verify proofs from any prover regardless of hardware + +**This means:** Any mechanism that could produce different gate counts or variable indices based +on thread assignment is a bug. This includes: +- Gate fusion that depends on task-to-thread assignment +- Shared mutable state accessed in nondeterministic order +- Any code path that reads `block.size()` in cursor mode (returns pre-allocated total, not cursor) + +## Next step: production PR + +**Key files:** +- `barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp` — wire `build_constraints_parallel` + into `create_circuit` +- `barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp` — declarations +- Builder files (already modified on PoC branch): `execution_trace_block.hpp`, `circuit_builder_base.hpp`, + `circuit_builder_base_impl.hpp`, `ultra_circuit_builder.hpp`, `ultra_circuit_builder.cpp` + +**Verification:** +- All existing tests must pass (no behavioral change for constraint correctness) +- `CircuitChecker::check()` must pass on all circuits +- `BuildConstraintsParallelN1vsN2` validates bit-identical circuits across thread counts +- VKs WILL change — run `barretenberg/cpp/scripts/test_chonk_standalone_vks_havent_changed.sh --update_inputs` + to pin new VKs after verifying correctness + +## Motivation + +Circuit construction from ACIR is entirely single-threaded — the biggest sequential bottleneck as +core count increases and as GPU-accelerated proving makes parallel work (MSM, sumcheck) cheaper. 
+ +Measured on real transaction (`ecdsar1+transfer_1_recursions+sponsored_fpc`, 11 circuits): + +| Cores | Total prove time | create_circuit | ProverInstance | Sequential circuit % | +|-------|------------------|----------------|----------------|---------------------| +| 1 | ~25s | 844ms | 815ms | ~6.6% | +| 8 | ~5.4s | 709ms | 497ms | ~22% | +| 32 | 5.58s | 702ms (12.6%) | 528ms (9.5%) | **22%** | + +With GPU proving, sequential circuit construction could become 50%+ of total time. + +## What the PoC proved + +### Core mechanism: cursor-based parallel writes + +Threads write to pre-allocated regions of a shared builder using per-thread cursors. No builder +duplication, no merge, no wire index remapping. The existing stdlib code (SHA256, Poseidon2, etc.) +is completely unmodified — it calls the same `populate_wires`, `emplace_back`, `add_variable` APIs, +which internally route through cursors when in parallel mode. + +**Tested with:** Two `std::thread`s running SHA256 + Poseidon2 concurrently on a shared builder. +Zero mismatches across all blocks, selectors, variables, and union-find. 500/500 stress test. + +### Copy constraints (assert_equal) are safe under concurrency + +Each opcode's `assert_equal` calls only touch its own internal variables and its own unique output +witnesses. The union-find modifications operate on disjoint variable sets across threads. No +deferral needed. + +**Key invariant:** ACIR output witnesses are unique per opcode. No two opcodes call `assert_equal` +on the same ACIR witness. The stdlib pattern is always: read inputs → compute → assert outputs +equal witnesses. The "ripple" from `update_real_variable_indices` stays contained within one opcode. + +**Tested with:** Chained SHA256 opcodes where A's output witnesses are B's input witnesses, running +on real concurrent threads. Union-find bit-identical to sequential (verified all 43,704 variables). 
+ +### Range constraints and lookup gates: deferred per-thread, replayed after join + +These are the only operations that append to shared collections (`range_lists[target].variable_indices` +and `table.lookup_gates`). Both are order-independent: +- `variable_indices` gets sorted and deduplicated in `process_range_list` +- `lookup_gates` just counts occurrences in `construct_lookup_read_counts` + +Per-thread buffers, concatenated in deterministic thread order after join. + +**Tested with:** 5,792 deferred lookup entries + 458 deferred range constraints from chained SHA256, +all replayed correctly. Finalized circuits pass CircuitChecker. + +### Gate construction is deterministic across threads + +After finalization, every block (arithmetic, lookup, delta_range, elliptic, poseidon2_external, +poseidon2_internal, pub_inputs) is bit-identical between sequential and parallel construction — +verified wire-by-wire and selector-by-selector. Variable counts match. Union-find matches. + +### `put_constant_variable` is safe with read-only cache after warmup + +All constants for tested opcodes (SHA256, Poseidon2) are fully covered by one warmup instance. +In cursor mode, the cache is read-only — lookups are safe for concurrent reads (no writers). +Zero cache misses observed during parallel phase. + +### `execute_parallel` orchestrator (production-ready) + +Lives on the builder (`UltraCircuitBuilder::execute_parallel`). Takes a vector of task lambdas +and pre-computed per-task sizes. 
Handles: +- Pre-allocation of all blocks and variables +- Per-thread cursor setup (on main thread, avoiding resize races) +- Thread dispatch with `set_parallel_thread_index` +- Deferred operation replay after join +- 500/500 stress test, zero race conditions + +### Profile-based planning (simulates eventual table lookup) + +`profile_constraint_type` runs a constraint on a throwaway builder and extracts: +- `TaskBlockSizes` (per-block gate counts + variable count) +- Constants to pre-register +- Range list targets to pre-create +- Lookup table IDs to pre-create + +`prepare_builder_from_profiles` populates the real builder's caches from this data without +executing any constraints. This is the interface the table lookup will eventually implement. + +## Real circuit analysis + +### Aztec transaction opcode breakdown (ecdsar1+transfer_0_recursions+sponsored_fpc) + +| Circuit | Opcodes | Gates | Key constraint types | +|---------|--------:|------:|---------------------| +| EcdsaRAccount:entrypoint | 8,938 | 78,062 | 2000 quad, 468 range, 1 sha256, 29 pos2, 1 ecdsa_r1 | +| private_kernel_init | 8,913 | 44,239 | 6253 quad, 1218 range, 69 pos2, 1 msm | +| private_kernel_inner | 19,697 | 95,347 | 10724 quad, 3683 big_quad, 2658 range, 69 pos2 | +| Token:transfer | 22,600 | 79,563 | 11675 quad, 3752 range, 57 pos2, 8 msm, 6 aes128 | +| private_kernel_reset | 29,586 | 102,252 | 16018 quad, 3403 big_quad, 4447 range, 375 pos2 | +| private_kernel_tail | 9,096 | 43,186 | 6634 quad, 1402 range, 11 pos2 | +| hiding_kernel | 1,502 | 36,180 | 1413 quad, 80 range, 7 pos2 | + +### Per-block gate breakdown for private_kernel_inner (95,347 gates) + +| Block | Gates | % | +|-------|------:|--:| +| arithmetic | 40,636 | 42.6% | +| poseidon2_internal | 39,216 | 41.1% | +| poseidon2_external | 6,880 | 7.2% | +| elliptic | 336 | 0.4% | + +No single opcode dominates. 69 Poseidon2 instances produce 48% of gates. 
With table lookup +(no warmup), all ~19,600 constraints distribute across threads with near-linear speedup. + +## Builder changes implemented + +### execution_trace_block.hpp +- Per-thread cursor arrays (`std::vector cursors_`) on `Selector` and `ExecutionTraceBlock` +- `thread_local parallel_thread_idx` for routing operations to correct cursor +- `enable_cursor_mode(thread_idx, start)` / `disable_cursor_mode(thread_idx)` +- `populate_wires` and selector writes route through `active_cursor()` / `active_cursor_ref()` +- `last_gate_index()` / `next_gate_index()` for cursor-aware gate position queries +- `wire_cursor_start()` for tracking task boundary (prevents cross-task gate fusion) + +### circuit_builder_base.hpp +- Per-thread variable cursor array (`std::vector variable_cursors_`) +- `enable_variable_cursor(thread_idx, start)` / `disable_variable_cursor(thread_idx)` +- `resize_variables(total_size)` for pre-allocation +- `get_variable_cursor()` routes through `parallel_thread_idx` +- `increment_num_gates` skipped in cursor mode +- `get_next_var_index()` / `get_prev_var_index()` const accessors + +### ultra_circuit_builder.hpp +- `TaskBlockSizes` struct + `snapshot_block_sizes()` / `delta()` +- `execute_parallel()` orchestrator +- Per-thread deferred buffers for lookup gates and range constraints +- `init_deferred_buffers()` / `apply_deferred_lookup_gates()` / `apply_deferred_range_constraints()` +- `update_used_witnesses` / `update_finalize_witnesses` skipped in cursor mode + +### ultra_circuit_builder.cpp +- `put_constant_variable`: read-only cache bypass in cursor mode +- `create_small_range_constraint`: deferral in cursor mode +- `create_gates_from_plookup_accumulators`: lookup gate deferral in cursor mode +- `create_ecc_add_gate` / `create_ecc_dbl_gate`: cursor-aware gate fusion (uses cursor position + instead of `block.size()` to find previous gate; fusion disabled at task boundaries) + +### rom_ram_logic.hpp / rom_ram_logic.cpp +- Per-thread ROM/RAM ID 
cursors for pre-allocated array assignment +- `create_ROM_array` / `create_RAM_array`: cursor-mode returns pre-assigned IDs +- `gate_index` recording uses `last_gate_index()` / `next_gate_index()` instead of `block.size()` + +### acir_format.cpp +- `profile_constraint_type()`: throwaway builder measurement (separate pre-warmed builder to + avoid cross-instance gate fusion in profiling) +- `prepare_builder_from_profiles()`: cache population from profiles +- `build_constraints_parallel()`: full parallel orchestration +- Constraint type grouping by gate-count-affecting parameters (range by num_bits, big_quad by + size(), logic by (num_bits, is_xor_gate), aes128/blake2s/blake3 by inputs.size(), poseidon2 + by state.size(), multi_scalar_mul by points.size()) + +## Shared state audit + +| State | Category | Solution | Verified | +|-------|----------|----------|----------| +| Block gate writes | Partitionable | Per-thread cursors | Yes (500/500) | +| `add_variable` | Partitionable | Per-thread variable cursors | Yes | +| `assert_equal` / union-find | Naturally disjoint | No change needed | Yes (43k vars) | +| `put_constant_variable` | Read-only after warmup | Cache bypass in cursor mode | Yes (0 misses) | +| Range list creation | One-time init | Pre-created from profiles | Yes | +| Plookup table creation | One-time init | Pre-created from profiles | Yes | +| `create_small_range_constraint` | Deferred | Per-thread buffer, replay | Yes (458 entries) | +| `table.lookup_gates` append | Deferred | Per-thread buffer, replay | Yes (5792 entries) | +| `update_used_witnesses` | Skip in cursor mode | Boomerang detection only | Yes | +| `update_finalize_witnesses` | Skip in cursor mode | Finalize detection only | Yes | +| `increment_num_gates` | Skip in cursor mode | Pre-computed total | Yes | +| ROM/RAM array creation | Pre-allocated | Per-thread ID cursors | Yes | +| `memory_read/write_records` | Gate index recording | Uses `last_gate_index()` | Yes | +| ECC gate fusion | 
Cursor-aware | Uses cursor position, not block.size() | In progress | + +## The remaining blocker: setup gate ordering + +### The problem + +In sequential `build_constraints`, the first constraint of each type triggers one-time setup: +- Range list staircase creation (`create_range_list` → arithmetic gates + variables) +- Lookup table initialization (`get_table` → populates `lookup_tables`) +- Constant registration (`put_constant_variable` → `fix_witness` → arithmetic gate) + +These setup gates are interleaved with the first constraint's own gates. Their position in the +circuit affects the VK. + +In parallel mode, `prepare_builder_from_profiles` creates setup gates separately before any +constraints run. The setup gates land at different positions → different circuit → different VK. + +Both circuits are valid (both pass CircuitChecker), but they are NOT identical. + +### The solution: precursor refactor + +Change the sequential `build_constraints` path to separate setup from execution. This is a +standalone change with no parallel code — just reordering when setup gates are created. + +### What currently happens (implicit setup) + +When `build_constraints` processes constraints sequentially, the first constraint of each type +triggers lazy initialization: + +1. **`put_constant_variable(value)`** — if the value isn't cached, creates a new variable + + `fix_witness` gate (1 arithmetic gate). Every subsequent call with the same value returns the + cached index. SHA256 creates ~900 unique constants on its first invocation; the 2nd+ SHA256 + finds them all cached. These `fix_witness` gates are interleaved with the first constraint's + own gates in the arithmetic block. + +2. **`create_range_list(target_range)`** — called lazily from `create_small_range_constraint` when + a range target hasn't been seen before. Creates a "staircase" of sorted padding variables + + unconstrained arithmetic gates (e.g., SHA256 triggers 5 range lists costing 1371 arithmetic + gates). 
These gates are interleaved with the first constraint that triggers each range. + +3. **`get_table(table_id)`** — called lazily from `create_gates_from_plookup_accumulators` when a + lookup table hasn't been created yet. Appends to `lookup_tables`. No gates are created, but the + table must exist before any plookup reads reference it. + +### What needs to change (explicit setup) + +Add a setup phase at the beginning of `build_constraints` that pre-creates all setup state before +any constraint processing. This requires knowing which constraint types are present in the program. + +**Concrete changes to `build_constraints` in `acir_format.cpp`:** + +1. **Pre-register constants.** Before the constraint loops, call `put_constant_variable(v)` for + every constant value that any constraint type will need. The set of constants per constraint + type is deterministic (SHA256 always needs the same ~900 constants, Poseidon2 needs ~5, etc.). + Source: `ConstraintProfile::constants` from `profile_constraint_type`, or eventually a stored + table. + +2. **Pre-create range lists.** Before the constraint loops, call `create_range_list(target)` for + every range target that any constraint type will need. Source: + `ConstraintProfile::range_list_targets`, or eventually a stored table. SHA256 needs targets + {1, 3, 7, 15, 16383}. Most opcodes need only {16383} (DEFAULT_PLOOKUP_RANGE_SIZE). + +3. **Pre-create lookup tables.** Before the constraint loops, call `get_table(id)` for every + BasicTableId that any constraint type will need. Source: `ConstraintProfile::table_ids`, or + eventually a stored table. SHA256 needs SHA256 lookup tables; logic constraints need XOR/AND + tables; etc. + +**The key invariant:** After the setup phase, no constraint execution triggers `put_constant_variable` +cache misses, `create_range_list` calls, or new `get_table` calls. Every constraint — whether it's +the 1st or 100th of its type — produces identical gates. 
+ +**Where the setup data comes from (now vs later):** +- **Now (PoC):** `profile_constraint_type` runs constraints on a throwaway builder and extracts + the constants/ranges/tables. This is slow (2x work per type) but correct. +- **Later (production):** A stored table keyed by `(constraint_type, parameters)` provides the + same data as a compile-time lookup. The table is generated once and validated by pinning tests. + +### Effect on the circuit + +The setup gates (fix_witness for constants, range list staircases) move from being interleaved +with the first constraint of each type to being grouped at the beginning of the circuit. This +changes: +- Gate ordering within the arithmetic block +- Variable indices (constants get earlier indices) +- The VK (different gate positions → different polynomials) + +It does NOT change: +- The set of constraints (same gates, just reordered) +- The satisfying witness assignment +- Circuit correctness (both old and new pass CircuitChecker and prove/verify) + +### Implementation plan + +**Single PR:** Ship the parallel infrastructure and wire it into `create_circuit` for Ultra. + +1. Extract all PoC changes (builder + acir_format) into a clean branch off `merge-train/barretenberg` +2. Wire `build_constraints_parallel` into `create_circuit` (replacing + `build_constraints` for Ultra only; Mega and other builders continue using the sequential path) +3. Enable `BuildConstraintsParallelN1vsN2` test to validate bit-identical circuits across thread counts +4. Run full test suite (`dsl_tests`, `ultra_honk_tests`, `chonk_tests`) +5. Update VKs via `test_chonk_standalone_vks_havent_changed.sh --update_inputs` +6. 
Later: replace `profile_constraint_type` (throwaway builder) with stored lookup tables for + production performance + +## Files modified in PoC + +| File | Change | +|------|--------| +| `honk/execution_trace/execution_trace_block.hpp` | Per-thread cursors, thread-local index, `last_gate_index()`, `next_gate_index()`, `wire_cursor_start()` | +| `stdlib_circuit_builders/circuit_builder_base.hpp` | Per-thread variable cursors, resize, accessors | +| `stdlib_circuit_builders/circuit_builder_base_impl.hpp` | Cursor-aware add_variable | +| `stdlib_circuit_builders/ultra_circuit_builder.hpp` | execute_parallel, deferred buffers, TaskBlockSizes, ROM/RAM cursor management | +| `stdlib_circuit_builders/ultra_circuit_builder.cpp` | put_constant_variable bypass, deferral checks, cursor-aware ECC gate fusion | +| `stdlib_circuit_builders/rom_ram_logic.hpp` | Per-thread ROM/RAM ID cursors | +| `stdlib_circuit_builders/rom_ram_logic.cpp` | Cursor-mode ROM/RAM creation, cursor-aware gate_index recording | +| `dsl/acir_format/acir_format.hpp` | build_constraints_parallel declaration | +| `dsl/acir_format/acir_format.cpp` | profile_constraint_type, prepare_builder_from_profiles, build_constraints_parallel, constraint grouping | +| `dsl/acir_format/per_block_gate_count.test.cpp` | All PoC tests | + +## Tests + +| Test | What it verifies | +|------|-----------------| +| `RealParallelChainedSha256` | Bit-identical circuit (full wire/selector/variable/union-find comparison) with real witness values, CircuitChecker on both, chained data dependencies, 5792 deferred lookups + 458 deferred ranges | +| `BuildConstraintsParallelN1vsN2` | Real AcirProgram through `build_constraints_parallel` with 1 vs 2 threads, full wire/selector/variable/union-find comparison — validates that sequential is just the N=1 case of parallel | +| `SequentialVsParallelSemanticEquivalence` | Sequential `build_constraints` vs `build_constraints_parallel` — same block sizes, variable counts, copy cycles, constants, 
range lists, lookup tables | +| `AcirTestParallelEquivalence` | Parameterized over all acir_tests — 3-way comparison (sequential, N=1, N=2) with semantic equivalence and bit-identical checks | +| `IsolatedVsSharedSelectorEquivalence` | Selector equivalence between isolated and shared warmed builders | +| `WarmedAdditivityComprehensive` | Gate count additivity across 5 opcode types after warmup | +| Individual opcode measurements | Per-block gate counts for Quad, SHA256, Poseidon2, EC Add, Logic XOR | diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/per_block_gate_count.test.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/per_block_gate_count.test.cpp new file mode 100644 index 000000000000..4c0a12281271 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/per_block_gate_count.test.cpp @@ -0,0 +1,956 @@ +/** + * @file per_block_gate_count.test.cpp + * @brief Measures per-block gate counts for each ACIR opcode type, and tests whether they are additive across opcodes. + * + * @details This is a PoC investigating whether ACIR circuit construction can be parallelized via a "plan then execute" + * model. The key question: if we know the per-block gate count for each opcode, can we pre-compute a deterministic + * layout (prefix sum of per-block sizes), then execute opcodes in parallel into pre-allocated regions? + * + * Step 1: Measure per-block gate counts for individual opcodes. + * Step 2: Test additivity — does the sum of individual per-block counts match a combined circuit? 
+ */ + +#include + +#include "acir_format.hpp" +#include "acir_to_constraint_buf.hpp" +#include "barretenberg/circuit_checker/circuit_checker.hpp" +#include "barretenberg/common/get_bytecode.hpp" +#include "barretenberg/crypto/poseidon2/poseidon2.hpp" +#include "barretenberg/crypto/sha256/sha256.hpp" +#include "barretenberg/dsl/acir_format/poseidon2_constraint.hpp" +#include "barretenberg/dsl/acir_format/sha256_constraint.hpp" +#include "barretenberg/dsl/acir_format/test_class.hpp" +#include "barretenberg/dsl/acir_format/utils.hpp" +#include "barretenberg/special_public_inputs/special_public_inputs.hpp" +#include "barretenberg/stdlib_circuit_builders/mega_circuit_builder.hpp" +#include "barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp" +#include "barretenberg/ultra_honk/prover_instance.hpp" +#include "barretenberg/ultra_honk/ultra_prover.hpp" + +#include + +using namespace bb; +using namespace acir_format; + +class PerBlockGateCountTests : public ::testing::Test { + protected: + static void SetUpTestSuite() { bb::srs::init_file_crs_factory(bb::srs::bb_crs_path()); } +}; + +// Helper to build the test program: 3 SHA256 + 3 Poseidon2 +AcirFormat build_sha256_poseidon2_test_program(WitnessVector& witness_out) +{ + std::vector all_opcodes; + + // 3 SHA256 compression constraints, each using 32 witnesses + for (uint32_t i = 0; i < 3; i++) { + uint32_t base = i * 32; + Sha256Compression sha; + for (size_t j = 0; j < 16; ++j) + sha.inputs[j] = WitnessOrConstant::from_index(base + static_cast(j)); + for (size_t j = 0; j < 8; ++j) + sha.hash_values[j] = WitnessOrConstant::from_index(base + static_cast(j)); + for (size_t j = 0; j < 8; ++j) + sha.result[j] = base + static_cast(j) + 24; + auto ops = constraint_to_acir_opcode(sha); + all_opcodes.insert(all_opcodes.end(), ops.begin(), ops.end()); + } + + // 3 Poseidon2 constraints, each using 8 witnesses, starting after SHA256 witnesses + for (uint32_t i = 0; i < 3; i++) { + uint32_t base = 96 + i * 8; + 
Poseidon2Constraint pos; + for (uint32_t j = 0; j < 4; j++) { + pos.state.emplace_back(WitnessOrConstant::from_index(base + j)); + pos.result.emplace_back(base + 4 + j); + } + auto ops = constraint_to_acir_opcode(pos); + all_opcodes.insert(all_opcodes.end(), ops.begin(), ops.end()); + } + + Acir::Circuit circuit = build_acir_circuit(all_opcodes); + witness_out = WitnessVector(120, fr(0)); + return circuit_serde_to_acir_format(circuit); +} + +// N=1 parallel vs N=2 parallel: should be bit-identical since both go through +// prepare_builder_from_profiles and execute_parallel. +TEST_F(PerBlockGateCountTests, ParallelN1vsN2BitIdentical) +{ + WitnessVector witness; + AcirFormat constraint_system = build_sha256_poseidon2_test_program(witness); + + // Build with 1 thread + AcirFormat n1_constraints = constraint_system; + UltraCircuitBuilder n1_builder{ WitnessVector(witness), n1_constraints.public_inputs, false }; + build_constraints_parallel(n1_builder, n1_constraints, ProgramMetadata{}, /*num_threads=*/1); + + // Build with 2 threads + AcirFormat n2_constraints = constraint_system; + UltraCircuitBuilder n2_builder{ WitnessVector(witness), n2_constraints.public_inputs, false }; + build_constraints_parallel(n2_builder, n2_constraints, ProgramMetadata{}, /*num_threads=*/2); + + // Both must pass circuit checker + EXPECT_TRUE(CircuitChecker::check(n1_builder)); + EXPECT_TRUE(CircuitChecker::check(n2_builder)); + + // Bit-identical: every block's wires and selectors must match + auto n1_blocks = n1_builder.blocks.get(); + auto n2_blocks = n2_builder.blocks.get(); + for (size_t b = 0; b < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; b++) { + EXPECT_EQ(n1_blocks[b].size(), n2_blocks[b].size()) << "block " << b << " size mismatch"; + size_t count = std::min(n1_blocks[b].size(), n2_blocks[b].size()); + + size_t wire_mismatches = 0; + for (size_t w = 0; w < 4; w++) { + for (size_t i = 0; i < count; i++) { + if (n1_blocks[b].wires[w][i] != n2_blocks[b].wires[w][i]) + 
wire_mismatches++; + } + } + EXPECT_EQ(wire_mismatches, 0) << "block " << b << ": " << wire_mismatches << " wire mismatches"; + + auto n1_sels = n1_blocks[b].get_selectors(); + auto n2_sels = n2_blocks[b].get_selectors(); + size_t sel_mismatches = 0; + for (size_t s = 0; s < n1_sels.size(); s++) { + for (size_t i = 0; i < count; i++) { + if (n1_sels[s][i] != n2_sels[s][i]) + sel_mismatches++; + } + } + EXPECT_EQ(sel_mismatches, 0) << "block " << b << ": " << sel_mismatches << " selector mismatches"; + } + + // Variable counts and union-find must match exactly + EXPECT_EQ(n1_builder.get_num_variables(), n2_builder.get_num_variables()); + size_t num_vars = std::min(n1_builder.get_num_variables(), n2_builder.get_num_variables()); + size_t real_idx_mismatches = 0; + for (size_t i = 0; i < num_vars; i++) { + if (n1_builder.real_variable_index[i] != n2_builder.real_variable_index[i]) + real_idx_mismatches++; + } + EXPECT_EQ(real_idx_mismatches, 0) << "real_variable_index mismatches"; +} + +// Helper: create a valid UltraHonk proof and convert it to a RecursionConstraint. +// Returns the constraint and the witness vector containing proof/VK data. 
+std::pair create_honk_recursion_test_data() +{ + using InnerFlavor = UltraFlavor; + using InnerBuilder = UltraCircuitBuilder; + using InnerProverInstance = ProverInstance_; + using InnerProver = UltraProver; + using InnerIO = stdlib::recursion::honk::DefaultIO; + + // Create a simple inner circuit: one mul gate + default public inputs + InnerBuilder inner_builder; + auto a = inner_builder.add_variable(fr::random_element()); + auto b = inner_builder.add_variable(fr::random_element()); + auto c = inner_builder.add_variable(inner_builder.get_variable(a) * inner_builder.get_variable(b)); + inner_builder.create_big_mul_add_gate({ .a = a, + .b = b, + .c = c, + .d = inner_builder.zero_idx(), + .mul_scaling = 1, + .a_scaling = 0, + .b_scaling = 0, + .c_scaling = -1, + .d_scaling = 0, + .const_scaling = 0 }); + InnerIO::add_default(inner_builder); + + auto prover_instance = std::make_shared(inner_builder); + auto verification_key = std::make_shared(prover_instance->get_precomputed()); + InnerProver prover(prover_instance, verification_key); + auto proof = prover.construct_proof(); + + WitnessVector witness; + RecursionConstraint constraint = + recursion_data_to_recursion_constraint(witness, + proof, + verification_key->to_field_elements(), + verification_key->hash(), + bb::fr::one(), + inner_builder.num_public_inputs() - InnerIO::PUBLIC_INPUTS_SIZE, + HONK); + + return { constraint, witness }; +} + +// Forward declarations for functions defined later in this file +size_t check_semantic_equivalence(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b); +size_t check_bit_identical(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b); +std::filesystem::path find_acir_tests_dir(); + +// Test that a circuit with a HONK recursion constraint passes CircuitChecker +// when built through the sequential and parallel paths. 
+TEST_F(PerBlockGateCountTests, RecursionConstraintBasic) +{ + auto [recursion_constraint, witness] = create_honk_recursion_test_data(); + + AcirFormat constraints{}; + constraints.honk_recursion_constraints = { recursion_constraint }; + constraints.original_opcode_indices.honk_recursion_constraints = { 0 }; + constraints.num_acir_opcodes = 1; + constraints.max_witness_index = static_cast(witness.size() - 1); + ProgramMetadata metadata{}; + + // Fix predicate to constant true (matching production Noir circuits) + constraints.honk_recursion_constraints[0].predicate = WitnessOrConstant::from_constant(bb::fr(1)); + + // Step 2: Use Mega for smaller circuits. Build parallel first, then sequential with same pre-warming. + // Mega parallel N=1 + AcirFormat par_constraints = constraints; + MegaCircuitBuilder par_builder{ + std::make_shared(), WitnessVector(witness), par_constraints.public_inputs, false + }; + build_constraints_parallel(par_builder, par_constraints, metadata, /*num_threads=*/1); + info(" Mega Parallel N=1: vars=", par_builder.get_num_variables()); + + // Mega sequential with same constants pre-registered + AcirFormat seq_constraints = constraints; + MegaCircuitBuilder seq_builder{ + std::make_shared(), WitnessVector(witness), seq_constraints.public_inputs, false + }; + for (const auto& [val, _] : par_builder.constant_variable_indices) { + seq_builder.put_constant_variable(val); + } + for (const auto& [target, rl] : par_builder.range_lists) { + if (seq_builder.range_lists.count(target) == 0) { + seq_builder.range_lists.insert({ target, seq_builder.create_range_list(target) }); + } + } + build_constraints(seq_builder, seq_constraints, metadata); + info(" Mega Sequential (pre-warmed): vars=", seq_builder.get_num_variables()); + + // Compare + EXPECT_EQ(par_builder.get_num_variables(), seq_builder.get_num_variables()) << "Variable count mismatch"; + { + auto pb = par_builder.blocks.get(); + auto sb = seq_builder.blocks.get(); + for (size_t bl = 0; bl < 
MegaCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + EXPECT_EQ(pb[bl].size(), sb[bl].size()) << "Block " << bl << " size mismatch"; + } + } + // Copy cycles + { + auto collect_cycles = [](auto& builder) { + std::map root_sizes; + for (size_t i = 0; i < builder.get_num_variables(); i++) { + root_sizes[builder.real_variable_index[i]]++; + } + std::vector> cycles; + for (const auto& [root, sz] : root_sizes) { + cycles.emplace_back(builder.get_variable(root), sz); + } + std::sort(cycles.begin(), cycles.end(), [](const auto& x, const auto& y) { + return x.second != y.second ? x.second < y.second : x.first < y.first; + }); + return cycles; + }; + auto par_cycles = collect_cycles(par_builder); + auto seq_cycles = collect_cycles(seq_builder); + size_t cycle_mismatches = 0; + if (par_cycles.size() == seq_cycles.size()) { + for (size_t i = 0; i < par_cycles.size(); i++) { + if (par_cycles[i] != seq_cycles[i]) + cycle_mismatches++; + } + } + info(" Copy cycles: ", par_cycles.size(), " vs ", seq_cycles.size(), ", mismatches=", cycle_mismatches); + } + // Gate multiset for block 4 (arithmetic in Mega) + { + auto pb = par_builder.blocks.get(); + auto sb = seq_builder.blocks.get(); + size_t bl = 4; // arithmetic + if (pb[bl].size() == sb[bl].size() && pb[bl].size() > 0) { + size_t count = pb[bl].size(); + auto ps = pb[bl].get_selectors(); + auto ss = sb[bl].get_selectors(); + size_t ts = 4 + ps.size(); + auto ct = [&](const auto& blk, const auto& sels, auto& builder) { + std::vector> tuples; + tuples.reserve(count); + for (size_t i = 0; i < count; i++) { + std::vector t(ts); + for (size_t w = 0; w < 4; w++) + t[w] = builder.get_variable(blk.wires[w][i]); + for (size_t s = 0; s < sels.size(); s++) + t[4 + s] = sels[s][i]; + tuples.push_back(std::move(t)); + } + std::sort(tuples.begin(), tuples.end()); + return tuples; + }; + auto pt = ct(pb[bl], ps, par_builder); + auto st = ct(sb[bl], ss, seq_builder); + info(" Block 4 (arithmetic) multiset: ", pt == st ? 
"MATCH" : "MISMATCH", " (", count, " gates)"); + EXPECT_TRUE(pt == st) << "Gate multiset mismatch in block 4"; + } + } + + // CircuitChecker on both + EXPECT_TRUE(CircuitChecker::check(par_builder)) << "Parallel N=1 failed CircuitChecker"; + EXPECT_TRUE(CircuitChecker::check(seq_builder)) << "Sequential failed CircuitChecker"; + + // N=1 vs N=2 bit-identical + { + AcirFormat par2_constraints = constraints; + MegaCircuitBuilder par2_builder{ + std::make_shared(), WitnessVector(witness), par2_constraints.public_inputs, false + }; + build_constraints_parallel(par2_builder, par2_constraints, metadata, /*num_threads=*/2); + info(" Mega Parallel N=2: vars=", par2_builder.get_num_variables()); + EXPECT_EQ(par_builder.get_num_variables(), par2_builder.get_num_variables()) << "N=1 vs N=2 var count"; + + size_t n1_n2_diffs = 0; + for (size_t i = 0; i < par_builder.get_num_variables(); i++) { + if (par_builder.real_variable_index[i] != par2_builder.real_variable_index[i]) + n1_n2_diffs++; + } + info(" N=1 vs N=2 real_variable_index diffs: ", n1_n2_diffs); + EXPECT_EQ(n1_n2_diffs, 0) << "N=1 vs N=2 not bit-identical"; + } + + // Quick Ultra check: does the same test fail with Ultra? 
+ { + AcirFormat ultra_par_c = constraints; + UltraCircuitBuilder ultra_par{ WitnessVector(witness), ultra_par_c.public_inputs, false }; + build_constraints_parallel(ultra_par, ultra_par_c, metadata, /*num_threads=*/1); + + AcirFormat ultra_seq_c = constraints; + UltraCircuitBuilder ultra_seq{ WitnessVector(witness), ultra_seq_c.public_inputs, false }; + for (const auto& [val, _] : ultra_par.constant_variable_indices) { + ultra_seq.put_constant_variable(val); + } + for (const auto& [target, rl] : ultra_par.range_lists) { + if (ultra_seq.range_lists.count(target) == 0) { + ultra_seq.range_lists.insert({ target, ultra_seq.create_range_list(target) }); + } + } + build_constraints(ultra_seq, ultra_seq_c, metadata); + info(" Ultra: par vars=", ultra_par.get_num_variables(), " seq vars=", ultra_seq.get_num_variables()); + + size_t ultra_failures = check_semantic_equivalence("recursion Ultra seq-vs-par", ultra_seq, ultra_par); + info(" Ultra seq-vs-par: ", ultra_failures, " failures"); + } +} + +// Test recursion constraint alongside other constraint types in the parallel pipeline. +// Uses Mega builder for speed. The recursion constraint runs in Phase 4 (sequential), +// while quads and ranges run in Phase 3 (parallel). +TEST_F(PerBlockGateCountTests, RecursionWithOtherConstraints) +{ + auto [recursion_constraint, rec_witness] = create_honk_recursion_test_data(); + + // Build an AcirFormat with: the recursion constraint + some quad constraints + some range constraints. + // The quads and ranges use witness indices beyond the recursion witness range. 
+ uint32_t rec_max_witness = static_cast(rec_witness.size() - 1); + + // Create 4 quad constraints using fresh witnesses after the recursion witness range + std::vector quads; + uint32_t w = rec_max_witness + 1; + for (int i = 0; i < 4; i++) { + quads.push_back({ .a = w, + .b = w + 1, + .c = w + 2, + .d = w + 3, + .mul_scaling = 1, + .a_scaling = 0, + .b_scaling = 0, + .c_scaling = -1, + .d_scaling = 0, + .const_scaling = 0 }); + w += 4; + } + + // Create 4 range constraints on fresh witnesses + std::vector ranges; + for (int i = 0; i < 4; i++) { + ranges.push_back({ .witness = w, .num_bits = 8 }); + w++; + } + + uint32_t total_witnesses = w; + + // Extend witness vector with valid values for the new constraints + WitnessVector witness = rec_witness; + witness.resize(total_witnesses, fr(0)); + // Fill quad witnesses: a*b = c + uint32_t qw = rec_max_witness + 1; + for (int i = 0; i < 4; i++) { + fr a_val = fr::random_element(); + fr b_val = fr::random_element(); + witness[qw] = a_val; + witness[qw + 1] = b_val; + witness[qw + 2] = a_val * b_val; + witness[qw + 3] = fr(0); + qw += 4; + } + // Range witnesses: small values that fit in 8 bits + for (int i = 0; i < 4; i++) { + witness[qw + static_cast(i)] = fr(42 + i); + } + + AcirFormat constraints{}; + constraints.honk_recursion_constraints = { recursion_constraint }; + constraints.original_opcode_indices.honk_recursion_constraints = { 0 }; + constraints.quad_constraints = quads; + constraints.original_opcode_indices.quad_constraints = { 1, 2, 3, 4 }; + constraints.range_constraints = ranges; + constraints.original_opcode_indices.range_constraints = { 5, 6, 7, 8 }; + constraints.num_acir_opcodes = 9; + constraints.max_witness_index = total_witnesses - 1; + + ProgramMetadata metadata{}; + + // Build with Mega N=1 and N=2 + AcirFormat n1_constraints = constraints; + MegaCircuitBuilder n1_builder{ + std::make_shared(), WitnessVector(witness), n1_constraints.public_inputs, false + }; + 
build_constraints_parallel(n1_builder, n1_constraints, metadata, /*num_threads=*/1); + + AcirFormat n2_constraints = constraints; + MegaCircuitBuilder n2_builder{ + std::make_shared(), WitnessVector(witness), n2_constraints.public_inputs, false + }; + build_constraints_parallel(n2_builder, n2_constraints, metadata, /*num_threads=*/2); + + info("Recursion+quads+ranges Mega: N1 vars=", + n1_builder.get_num_variables(), + " N2 vars=", + n2_builder.get_num_variables()); + + EXPECT_TRUE(CircuitChecker::check(n1_builder)) << "N=1 CircuitChecker failed"; + EXPECT_TRUE(CircuitChecker::check(n2_builder)) << "N=2 CircuitChecker failed"; + EXPECT_EQ(n1_builder.get_num_variables(), n2_builder.get_num_variables()); +} + +// Find the acir_tests directory relative to the source tree +std::filesystem::path find_acir_tests_dir() +{ + // Walk up from the build dir to find the repo root + // The acir_tests are at barretenberg/acir_tests/acir_tests/ + std::filesystem::path candidate = std::filesystem::current_path(); + for (int i = 0; i < 10; i++) { + auto test_dir = candidate / "barretenberg" / "acir_tests" / "acir_tests"; + if (std::filesystem::exists(test_dir)) { + return test_dir; + } + candidate = candidate.parent_path(); + } + return {}; +} + +// Collect all acir_test directories that have compiled artifacts +std::vector collect_acir_test_programs() +{ + auto acir_dir = find_acir_tests_dir(); + if (acir_dir.empty()) { + return {}; + } + std::vector programs; + for (const auto& entry : std::filesystem::directory_iterator(acir_dir)) { + if (!entry.is_directory()) + continue; + auto program_json = entry.path() / "target" / "program.json"; + auto witness_gz = entry.path() / "target" / "witness.gz"; + if (std::filesystem::exists(program_json) && std::filesystem::exists(witness_gz)) { + programs.push_back(entry.path()); + } + } + std::sort(programs.begin(), programs.end()); + return programs; +} + +// Check semantic equivalence between two builders: same block sizes, variable counts, 
+// copy cycle structure, constants, range lists, and lookup tables. +// Returns number of failures (0 = all invariants hold). +size_t check_semantic_equivalence(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b) +{ + size_t failures = 0; + + // Block sizes + auto a_blocks = a.blocks.get(); + auto b_blocks = b.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (a_blocks[bl].size() > 0 || b_blocks[bl].size() > 0) { + bool ok = (a_blocks[bl].size() == b_blocks[bl].size()); + info(label, + ": block ", + bl, + ": ", + a_blocks[bl].size(), + " vs ", + b_blocks[bl].size(), + ok ? " OK" : " MISMATCH"); + if (!ok) + failures++; + } + } + + // Variable count + { + bool ok = (a.get_num_variables() == b.get_num_variables()); + info(label, ": variables: ", a.get_num_variables(), " vs ", b.get_num_variables(), ok ? " OK" : " MISMATCH"); + if (!ok) + failures++; + } + + // Constants + { + bool ok = (a.constant_variable_indices.size() == b.constant_variable_indices.size()); + info(label, + ": constants: ", + a.constant_variable_indices.size(), + " vs ", + b.constant_variable_indices.size(), + ok ? " OK" : " MISMATCH"); + } + + // Range lists + { + bool ok = (a.range_lists.size() == b.range_lists.size()); + info(label, ": range_lists: ", a.range_lists.size(), " vs ", b.range_lists.size(), ok ? " OK" : " MISMATCH"); + } + + // Lookup tables + { + bool ok = (a.get_lookup_tables().size() == b.get_lookup_tables().size()); + info(label, + ": lookup_tables: ", + a.get_lookup_tables().size(), + " vs ", + b.get_lookup_tables().size(), + ok ? " OK" : " MISMATCH"); + } + + // Copy cycles: compare as sorted list of (value, cycle_size) pairs. + // Each cycle is a set of variables with the same real_variable_index root. + // The cycle's "value" is the field element at that root (all vars in the cycle share it). + // This checks that the same groups of variables are assert_equal'd, up to reordering. 
+ auto collect_cycles = [](const UltraCircuitBuilder& builder) -> std::vector> { + std::map root_sizes; + for (size_t i = 0; i < builder.get_num_variables(); i++) { + root_sizes[builder.real_variable_index[i]]++; + } + std::vector> cycles; + cycles.reserve(root_sizes.size()); + for (const auto& [root, sz] : root_sizes) { + cycles.emplace_back(builder.get_variable(root), sz); + } + std::sort(cycles.begin(), cycles.end(), [](const auto& x, const auto& y) { + if (x.second != y.second) + return x.second < y.second; + return x.first < y.first; + }); + return cycles; + }; + auto a_cycles = collect_cycles(a); + auto b_cycles = collect_cycles(b); + if (a_cycles.size() != b_cycles.size()) { + info(label, ": copy cycle count mismatch: ", a_cycles.size(), " vs ", b_cycles.size()); + failures++; + } else { + size_t cycle_mismatches = 0; + for (size_t i = 0; i < a_cycles.size(); i++) { + if (a_cycles[i] != b_cycles[i]) { + cycle_mismatches++; + } + } + if (cycle_mismatches > 0) { + info(label, ": ", cycle_mismatches, " copy cycle (value, size) mismatches out of ", a_cycles.size()); + failures++; + } + } + + // Constants: same set of constant values (not just count) + { + std::set a_consts, b_consts; + for (const auto& [val, _] : a.constant_variable_indices) + a_consts.insert(val); + for (const auto& [val, _] : b.constant_variable_indices) + b_consts.insert(val); + if (a_consts != b_consts) { + info(label, ": constant value sets differ: a has ", a_consts.size(), " b has ", b_consts.size()); + failures++; + } + } + + // Range lists: same targets, same variable counts per target + if (a.range_lists.size() != b.range_lists.size()) { + info(label, ": range list count mismatch: ", a.range_lists.size(), " vs ", b.range_lists.size()); + failures++; + } + for (const auto& [target, a_rl] : a.range_lists) { + auto it = b.range_lists.find(target); + if (it == b.range_lists.end()) { + info(label, ": range target ", target, " missing from second builder"); + failures++; + } else if 
(a_rl.variable_indices.size() != it->second.variable_indices.size()) { + info(label, + ": range target ", + target, + " variable count mismatch: ", + a_rl.variable_indices.size(), + " vs ", + it->second.variable_indices.size()); + failures++; + } + } + + // Gate multiset comparison: for each block, collect all gate tuples (resolved wire values + + // selector values), sort them, and compare. This checks that the same gates exist in both + // circuits regardless of ordering. + { + auto a_blks = a.blocks.get(); + auto b_blks = b.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (a_blks[bl].size() != b_blks[bl].size()) { + continue; // already reported as block size mismatch + } + size_t count = a_blks[bl].size(); + if (count == 0) { + continue; + } + + // Collect gate tuples: 4 resolved wire values + all selector values + auto a_sels = a_blks[bl].get_selectors(); + auto b_sels = b_blks[bl].get_selectors(); + size_t tuple_size = 4 + a_sels.size(); + + auto collect_tuples = [&](const auto& blk, const auto& sels, const UltraCircuitBuilder& builder) { + std::vector> tuples; + tuples.reserve(count); + for (size_t i = 0; i < count; i++) { + std::vector t(tuple_size); + for (size_t w = 0; w < 4; w++) { + t[w] = builder.get_variable(blk.wires[w][i]); + } + for (size_t s = 0; s < sels.size(); s++) { + t[4 + s] = sels[s][i]; + } + tuples.push_back(std::move(t)); + } + std::sort(tuples.begin(), tuples.end()); + return tuples; + }; + + auto a_tuples = collect_tuples(a_blks[bl], a_sels, a); + auto b_tuples = collect_tuples(b_blks[bl], b_sels, b); + + if (a_tuples != b_tuples) { + info(label, ": block ", bl, " gate multiset mismatch (", count, " gates)"); + // Find first difference + size_t a_only = 0; + size_t b_only = 0; + size_t ai = 0; + size_t bi = 0; + while (ai < a_tuples.size() && bi < b_tuples.size()) { + if (a_tuples[ai] == b_tuples[bi]) { + ai++; + bi++; + } else if (a_tuples[ai] < b_tuples[bi]) { + a_only++; + ai++; 
+ } else { + b_only++; + bi++; + } + } + a_only += a_tuples.size() - ai; + b_only += b_tuples.size() - bi; + info(label, ": block ", bl, " a_only=", a_only, " b_only=", b_only); + // Print first few differing tuples from each side + ai = 0; + bi = 0; + size_t printed_a = 0; + size_t printed_b = 0; + while (ai < a_tuples.size() && bi < b_tuples.size() && (printed_a < 3 || printed_b < 3)) { + if (a_tuples[ai] == b_tuples[bi]) { + ai++; + bi++; + } else if (a_tuples[ai] < b_tuples[bi]) { + if (printed_a < 3) { + info(" a_only[", + printed_a, + "]: w0=", + a_tuples[ai][0], + " w1=", + a_tuples[ai][1], + " w2=", + a_tuples[ai][2], + " w3=", + a_tuples[ai][3]); + printed_a++; + } + ai++; + } else { + if (printed_b < 3) { + info(" b_only[", + printed_b, + "]: w0=", + b_tuples[bi][0], + " w1=", + b_tuples[bi][1], + " w2=", + b_tuples[bi][2], + " w3=", + b_tuples[bi][3]); + printed_b++; + } + bi++; + } + } + failures++; + } + } + } + + // Lookup tables + if (a.get_lookup_tables().size() != b.get_lookup_tables().size()) { + info(label, + ": lookup table count mismatch: ", + a.get_lookup_tables().size(), + " vs ", + b.get_lookup_tables().size()); + failures++; + } + + return failures; +} + +// Check bit-identical circuits (every wire, selector, variable, and union-find entry must match). +// Returns number of mismatches (0 = identical). 
+size_t check_bit_identical(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b) +{ + size_t mismatches = 0; + + auto a_blocks = a.blocks.get(); + auto b_blocks = b.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (a_blocks[bl].size() != b_blocks[bl].size()) { + info(label, ": block ", bl, " size mismatch: ", a_blocks[bl].size(), " vs ", b_blocks[bl].size()); + mismatches++; + continue; + } + size_t count = a_blocks[bl].size(); + for (size_t w = 0; w < 4; w++) { + for (size_t i = 0; i < count; i++) { + if (a_blocks[bl].wires[w][i] != b_blocks[bl].wires[w][i]) + mismatches++; + } + } + auto a_sels = a_blocks[bl].get_selectors(); + auto b_sels = b_blocks[bl].get_selectors(); + for (size_t s = 0; s < a_sels.size(); s++) { + for (size_t i = 0; i < count; i++) { + if (a_sels[s][i] != b_sels[s][i]) + mismatches++; + } + } + } + + if (a.get_num_variables() != b.get_num_variables()) { + info(label, ": variable count mismatch"); + mismatches++; + } else { + for (size_t i = 0; i < a.get_num_variables(); i++) { + if (a.real_variable_index[i] != b.real_variable_index[i]) + mismatches++; + } + } + + return mismatches; +} + +// Parameterized test that runs the 3-way comparison on every acir_test program. 
+class AcirTestParallelEquivalence : public ::testing::TestWithParam { + protected: + static void SetUpTestSuite() { bb::srs::init_file_crs_factory(bb::srs::bb_crs_path()); } +}; + +TEST_P(AcirTestParallelEquivalence, SequentialN1N2) +{ + auto test_dir = GetParam(); + std::string test_name = test_dir.filename().string(); + auto program_path = test_dir / "target" / "program.json"; + auto witness_path = test_dir / "target" / "witness.gz"; + + // Load bytecode and witness + auto bytecode = get_bytecode(program_path.string()); + AcirFormat constraints = circuit_buf_to_acir_format(std::move(bytecode)); + auto witness_buf = gunzip(witness_path.string()); + WitnessVector witness = witness_buf_to_witness_vector(std::move(witness_buf)); + + // Print constraint breakdown for diagnostics + info(" quad=", + constraints.quad_constraints.size(), + " big_quad=", + constraints.big_quad_constraints.size(), + " logic=", + constraints.logic_constraints.size(), + " range=", + constraints.range_constraints.size(), + " sha256=", + constraints.sha256_compression.size(), + " ecdsa_k1=", + constraints.ecdsa_k1_constraints.size(), + " ecdsa_r1=", + constraints.ecdsa_r1_constraints.size(), + " poseidon2=", + constraints.poseidon2_constraints.size(), + " block=", + constraints.block_constraints.size(), + " msm=", + constraints.multi_scalar_mul_constraints.size(), + " ec_add=", + constraints.ec_add_constraints.size(), + " aes128=", + constraints.aes128_constraints.size()); + + // Skip circuits with no parallelizable constraints (e.g., brillig-only programs) + bool has_constraints = !constraints.quad_constraints.empty() || !constraints.big_quad_constraints.empty() || + !constraints.logic_constraints.empty() || !constraints.range_constraints.empty() || + !constraints.sha256_compression.empty() || !constraints.ecdsa_k1_constraints.empty() || + !constraints.ecdsa_r1_constraints.empty() || !constraints.poseidon2_constraints.empty() || + !constraints.multi_scalar_mul_constraints.empty() || + 
!constraints.ec_add_constraints.empty() || !constraints.aes128_constraints.empty() || + !constraints.blake2s_constraints.empty() || !constraints.blake3_constraints.empty() || + !constraints.keccak_permutations.empty(); + if (!has_constraints) { + GTEST_SKIP() << "No parallelizable constraints"; + } + + // Skip recursion programs (need pre-computed proof data not available in this test) + if (!constraints.honk_recursion_constraints.empty() || !constraints.avm_recursion_constraints.empty() || + !constraints.hn_recursion_constraints.empty() || !constraints.chonk_recursion_constraints.empty()) { + GTEST_SKIP() << "Recursion constraints not supported in this test"; + } + + // 1. Build sequentially via create_circuit (uses build_constraints) + AcirProgram seq_program{ constraints, WitnessVector(witness) }; + auto seq_builder = create_circuit(seq_program, ProgramMetadata{}); + + // 2. Build via parallel path with N=1 + AcirFormat n1_constraints = constraints; + UltraCircuitBuilder n1_builder{ WitnessVector(witness), n1_constraints.public_inputs, false }; + build_constraints_parallel(n1_builder, n1_constraints, ProgramMetadata{}, /*num_threads=*/1); + + // 3. 
Build via parallel path with N=2 + AcirFormat n2_constraints = constraints; + UltraCircuitBuilder n2_builder{ WitnessVector(witness), n2_constraints.public_inputs, false }; + build_constraints_parallel(n2_builder, n2_constraints, ProgramMetadata{}, /*num_threads=*/2); + + // Print block sizes for all three builders + { + auto sb = seq_builder.blocks.get(); + auto n1b = n1_builder.blocks.get(); + auto n2b = n2_builder.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (sb[bl].size() > 0 || n1b[bl].size() > 0 || n2b[bl].size() > 0) { + info(" block ", bl, ": seq=", sb[bl].size(), " n1=", n1b[bl].size(), " n2=", n2b[bl].size()); + } + } + info(" vars: seq=", + seq_builder.get_num_variables(), + " n1=", + n1_builder.get_num_variables(), + " n2=", + n2_builder.get_num_variables()); + info(" constants: seq=", + seq_builder.constant_variable_indices.size(), + " n1=", + n1_builder.constant_variable_indices.size(), + " n2=", + n2_builder.constant_variable_indices.size()); + info(" range_lists: seq=", + seq_builder.range_lists.size(), + " n1=", + n1_builder.range_lists.size(), + " n2=", + n2_builder.range_lists.size()); + for (const auto& [target, rl] : seq_builder.range_lists) { + auto n1_it = n1_builder.range_lists.find(target); + size_t n1_count = (n1_it != n1_builder.range_lists.end()) ? 
n1_it->second.variable_indices.size() : 0; + info(" range ", target, ": seq=", rl.variable_indices.size(), " n1=", n1_count); + } + // Check for range lists in n1 that aren't in seq + for (const auto& [target, rl] : n1_builder.range_lists) { + if (seq_builder.range_lists.find(target) == seq_builder.range_lists.end()) { + info(" range ", target, ": seq=MISSING n1=", rl.variable_indices.size()); + } + } + } + + // All three must pass circuit checker + bool seq_ok = CircuitChecker::check(seq_builder); + bool n1_ok = CircuitChecker::check(n1_builder); + bool n2_ok = CircuitChecker::check(n2_builder); + EXPECT_TRUE(seq_ok) << test_name << ": sequential CircuitChecker failed"; + EXPECT_TRUE(n1_ok) << test_name << ": N=1 CircuitChecker failed"; + EXPECT_TRUE(n2_ok) << test_name << ": N=2 CircuitChecker failed"; + + // Sequential vs N=1: semantic equivalence (same constraints, different order) + size_t seq_n1_failures = check_semantic_equivalence(test_name + " seq-vs-n1", seq_builder, n1_builder); + EXPECT_EQ(seq_n1_failures, 0) << test_name << ": sequential vs N=1 semantic equivalence failed"; + + // Sequential vs N=2: semantic equivalence + size_t seq_n2_failures = check_semantic_equivalence(test_name + " seq-vs-n2", seq_builder, n2_builder); + EXPECT_EQ(seq_n2_failures, 0) << test_name << ": sequential vs N=2 semantic equivalence failed"; + + // N=1 vs N=2: must be bit-identical + size_t n1_n2_mismatches = check_bit_identical(test_name + " n1-vs-n2", n1_builder, n2_builder); + if (n1_n2_mismatches > 0) { + // Print first few wire mismatches + auto n1b = n1_builder.blocks.get(); + auto n2b = n2_builder.blocks.get(); + size_t printed = 0; + for (size_t b = 0; b < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS && printed < 5; b++) { + size_t count = std::min(n1b[b].size(), n2b[b].size()); + for (size_t w = 0; w < 4 && printed < 5; w++) { + for (size_t i = 0; i < count && printed < 5; i++) { + if (n1b[b].wires[w][i] != n2b[b].wires[w][i]) { + info(" WIRE DIFF block=", + b, 
+ " gate=", + i, + " wire=", + w, + " n1=", + n1b[b].wires[w][i], + " n2=", + n2b[b].wires[w][i]); + printed++; + } + } + } + } + // Print first few real_variable_index mismatches + size_t num_vars = std::min(n1_builder.get_num_variables(), n2_builder.get_num_variables()); + printed = 0; + for (size_t i = 0; i < num_vars && printed < 5; i++) { + if (n1_builder.real_variable_index[i] != n2_builder.real_variable_index[i]) { + info(" REAL_VAR_IDX DIFF var=", + i, + " n1=", + n1_builder.real_variable_index[i], + " n2=", + n2_builder.real_variable_index[i]); + printed++; + } + } + } + EXPECT_EQ(n1_n2_mismatches, 0) << test_name << ": N=1 vs N=2 bit-identical check failed"; +} + +INSTANTIATE_TEST_SUITE_P(AcirTests, + AcirTestParallelEquivalence, + ::testing::ValuesIn(collect_acir_test_programs()), + [](const ::testing::TestParamInfo& info) { + return info.param.filename().string(); + }); diff --git a/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp b/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp index 609c2c380358..47be60cb4c38 100644 --- a/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp @@ -19,6 +19,19 @@ namespace bb { +// Thread-local index for parallel circuit construction. Used by Selector, ExecutionTraceBlock, +// and CircuitBuilderBase to route operations through per-thread cursors. +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +inline thread_local size_t parallel_thread_idx = 0; +inline void set_parallel_thread_index(size_t idx) +{ + parallel_thread_idx = idx; +} +inline size_t get_parallel_thread_index() +{ + return parallel_thread_idx; +} + #ifdef CHECK_CIRCUIT_STACKTRACES struct BbStackTrace : backward::StackTrace { BbStackTrace() { load_here(32); } @@ -119,6 +132,46 @@ template class Selector { * @brief Release all memory held by this selector. 
 */ virtual void free_memory() {} + + /** + * @brief Enable cursor mode for a specific thread. + * @details Used for parallel circuit construction where blocks are pre-allocated and threads write at + * pre-determined offsets. The underlying storage must already be sized to accommodate the writes. + * Thread index is set via set_parallel_thread_index() before processing opcodes. + */ + void enable_cursor_mode(size_t thread_idx, size_t start) + { + if (thread_idx >= cursors_.size()) { + cursors_.resize(thread_idx + 1, CURSOR_DISABLED); + } + cursors_[thread_idx] = start; + } + + // Legacy single-thread interface (uses thread index 0) + void enable_cursor_mode(size_t start) { enable_cursor_mode(0, start); } + + void disable_cursor_mode(size_t thread_idx) + { + if (thread_idx < cursors_.size()) { + cursors_[thread_idx] = CURSOR_DISABLED; + } + } + void disable_cursor_mode() { disable_cursor_mode(0); } + + bool is_cursor_mode() const { return active_cursor() != CURSOR_DISABLED; } + + size_t active_cursor() const + { + auto idx = get_parallel_thread_index(); + return (cursors_.empty() || idx >= cursors_.size()) ? 
CURSOR_DISABLED : cursors_[idx]; + } + + size_t& active_cursor_ref() { return cursors_[get_parallel_thread_index()]; } + + static constexpr size_t CURSOR_DISABLED = std::numeric_limits::max(); + + protected: + std::vector cursors_; // per-thread cursors }; /** @@ -134,13 +187,21 @@ template class ZeroSelector : public Selector { void emplace_back(int value) override { BB_ASSERT_EQ(value, 0, "Calling ZeroSelector::emplace_back with a non zero value."); - size_++; + if (this->is_cursor_mode()) { + this->active_cursor_ref()++; + } else { + size_++; + } } void push_back(const FF& value) override { BB_ASSERT(value.is_zero()); - size_++; + if (this->is_cursor_mode()) { + this->active_cursor_ref()++; + } else { + size_++; + } } void set(size_t, int) override { BB_ASSERT(false, "ZeroSelector::set should not be called"); } @@ -179,8 +240,22 @@ template class SlabVectorSelector : public Selector { public: using Selector::emplace_back; - void emplace_back(int i) override { data.emplace_back(i); } - void push_back(const FF& value) override { data.push_back(value); } + void emplace_back(int i) override + { + if (this->is_cursor_mode()) { + data[this->active_cursor_ref()++] = i; + } else { + data.emplace_back(i); + } + } + void push_back(const FF& value) override + { + if (this->is_cursor_mode()) { + data[this->active_cursor_ref()++] = value; + } else { + data.push_back(value); + } + } void set(size_t idx, int i) override { data[idx] = i; } void set(size_t idx, const FF& value) override { data[idx] = value; } void resize(size_t new_size) override { data.resize(new_size); } @@ -246,6 +321,14 @@ template class ExecutionTraceBlock { size_t cached_size_ = 0; // set by free_data() so size() works after freeing bool data_freed_ = false; // true after free_data() has been called uint32_t trace_offset_ = std::numeric_limits::max(); // where this block starts in the trace + std::vector wire_cursors_; // per-thread wire cursors + + size_t wire_active_cursor() const + { + auto idx = 
get_parallel_thread_index(); + return (wire_cursors_.empty() || idx >= wire_cursors_.size()) ? Selector::CURSOR_DISABLED + : wire_cursors_[idx]; + } uint32_t trace_offset() const { @@ -257,6 +340,65 @@ template class ExecutionTraceBlock { size_t size() const { return data_freed_ ? cached_size_ : std::get<0>(this->wires).size(); } + /** + * @brief Get the index of the gate most recently written (via populate_wires). + * @details In cursor mode, populate_wires writes at the cursor and then increments it, + * so the last gate is at cursor - 1. In normal mode, it's size() - 1 as usual. + * Must be called immediately after populate_wires (before any other writes to this block). + */ + size_t last_gate_index() const + { + size_t wc = wire_active_cursor(); + if (wc != Selector::CURSOR_DISABLED) { + return wc - 1; + } + return size() - 1; + } + + /** + * @brief Get the index where the next gate will be written. + * @details In cursor mode, returns the current cursor position. In normal mode, returns size(). + */ + size_t next_gate_index() const + { + size_t wc = wire_active_cursor(); + if (wc != Selector::CURSOR_DISABLED) { + return wc; + } + return size(); + } + + /** + * @brief Enable cursor mode for a thread: subsequent gate writes go to position `start` and advance. + * @details The block's wires and selectors must already be sized to accommodate the writes. + * Used for parallel circuit construction where threads write to pre-allocated regions. 
+ */ + void enable_cursor_mode(size_t thread_idx, size_t start) + { + if (thread_idx >= wire_cursors_.size()) { + wire_cursors_.resize(thread_idx + 1, Selector::CURSOR_DISABLED); + } + wire_cursors_[thread_idx] = start; + for (auto& sel : get_selectors()) { + sel.enable_cursor_mode(thread_idx, start); + } + } + + // Legacy single-thread interface + void enable_cursor_mode(size_t start) { enable_cursor_mode(0, start); } + + void disable_cursor_mode(size_t thread_idx) + { + if (thread_idx < wire_cursors_.size()) { + wire_cursors_[thread_idx] = Selector::CURSOR_DISABLED; + } + for (auto& sel : get_selectors()) { + sel.disable_cursor_mode(thread_idx); + } + } + + void disable_cursor_mode() { disable_cursor_mode(0); } + #ifdef TRACY_HACK_GATES_AS_MEMORY ~ExecutionTraceBlock() { @@ -295,10 +437,19 @@ template class ExecutionTraceBlock { this->stack_traces.populate(); #endif this->tracy_gate(); - this->wires[0].emplace_back(idx_1); - this->wires[1].emplace_back(idx_2); - this->wires[2].emplace_back(idx_3); - this->wires[3].emplace_back(idx_4); + size_t wc = wire_active_cursor(); + if (wc != Selector::CURSOR_DISABLED) { + this->wires[0][wc] = idx_1; + this->wires[1][wc] = idx_2; + this->wires[2][wc] = idx_3; + this->wires[3][wc] = idx_4; + wire_cursors_[get_parallel_thread_index()]++; + } else { + this->wires[0].emplace_back(idx_1); + this->wires[1].emplace_back(idx_2); + this->wires[2].emplace_back(idx_3); + this->wires[3].emplace_back(idx_4); + } } auto& w_l() { return std::get<0>(this->wires); }; diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp index 3a556e5a6309..c9b031bc0716 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp @@ -9,6 +9,7 @@ #include "barretenberg/ecc/curves/bn254/bn254.hpp" #include 
"barretenberg/ecc/curves/bn254/fr.hpp" #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" +#include "barretenberg/honk/execution_trace/execution_trace_block.hpp" // for get_parallel_thread_index #include "barretenberg/honk/execution_trace/gate_data.hpp" #include "barretenberg/public_input_component/public_component_key.hpp" #include "barretenberg/serialize/msgpack.hpp" @@ -52,6 +53,44 @@ template class CircuitBuilderBase { size_t _num_gates = 0; + public: + // Cursor for parallel variable allocation. When enabled, add_variable writes at cursor position + // instead of appending. The variable vectors must be pre-sized to accommodate the writes. + static constexpr uint32_t VARIABLE_CURSOR_DISABLED = UINT32_MAX; + + // Deferred assert_equal entries for parallel construction. In cursor mode, assert_equal calls + // are recorded per-task and replayed in deterministic task order after all threads join. + // This prevents nondeterministic union-find results when multiple threads assert_equal on + // the same shared ACIR witness. + struct DeferredAssertEqual { + uint32_t a_variable_idx; + uint32_t b_variable_idx; + std::string msg; + bool operator==(const DeferredAssertEqual&) const = default; + }; + std::vector> deferred_assert_equals_; // per-task + + void init_deferred_assert_equal_buffers(size_t num_tasks) { deferred_assert_equals_.resize(num_tasks); } + + // Set which task index the current thread is executing (for assert_equal deferral). + // Thread-local so concurrent threads don't overwrite each other's task index. + void set_current_task_index(size_t task_idx) { current_task_idx_ = task_idx; } + static inline thread_local size_t current_task_idx_ = 0; + + void apply_deferred_assert_equals() + { + // Replay in task order (0, 1, 2, ...) 
for deterministic union-find results + for (auto& task_buf : deferred_assert_equals_) { + for (auto& entry : task_buf) { + assert_equal(entry.a_variable_idx, entry.b_variable_idx, entry.msg); + } + task_buf.clear(); + } + } + + private: + std::vector variable_cursors_; // per-thread variable cursors + /** * @brief Update all variables from index in equivalence class to have real variable new_real_index * @param index The index of a variable in the class we're updating @@ -144,6 +183,10 @@ template class CircuitBuilderBase { void increment_num_gates(size_t count = 1) { BB_ASSERT(!circuit_finalized, "Cannot add gates after circuit is finalized"); + // In cursor mode, gate count is pre-computed; skip to avoid races in parallel construction + if (get_variable_cursor() != VARIABLE_CURSOR_DISABLED) { + return; + } _num_gates += count; } @@ -188,6 +231,8 @@ template class CircuitBuilderBase { } const std::vector& public_inputs() const { return _public_inputs; }; + const std::vector& get_next_var_index() const { return next_var_index; } + const std::vector& get_prev_var_index() const { return prev_var_index; } /** * @brief Set the _public_inputs_finalized to true to prevent any new public inputs from being added @@ -211,6 +256,50 @@ template class CircuitBuilderBase { */ virtual uint32_t add_variable(const FF& in); + /** + * @brief Enable variable cursor mode for parallel construction. + * @details When enabled, add_variable writes at the cursor position instead of appending. + * The variables/real_variable_index/next_var_index/prev_var_index/real_variable_tags vectors + * must be pre-sized to accommodate the writes. 
+ */ + void enable_variable_cursor(size_t thread_idx, uint32_t start) + { + if (thread_idx >= variable_cursors_.size()) { + variable_cursors_.resize(thread_idx + 1, VARIABLE_CURSOR_DISABLED); + } + variable_cursors_[thread_idx] = start; + } + // Legacy single-thread interface + void enable_variable_cursor(uint32_t start) { enable_variable_cursor(0, start); } + + void disable_variable_cursor(size_t thread_idx) + { + if (thread_idx < variable_cursors_.size()) { + variable_cursors_[thread_idx] = VARIABLE_CURSOR_DISABLED; + } + } + void disable_variable_cursor() { disable_variable_cursor(0); } + + uint32_t get_variable_cursor() const + { + auto idx = get_parallel_thread_index(); + return (variable_cursors_.empty() || idx >= variable_cursors_.size()) ? VARIABLE_CURSOR_DISABLED + : variable_cursors_[idx]; + } + + /** + * @brief Pre-allocate variable storage for parallel construction. + * @param total_size The total number of variables (existing + new from all threads). + */ + void resize_variables(size_t total_size) + { + variables.resize(total_size); + real_variable_index.resize(total_size); + next_var_index.resize(total_size); + prev_var_index.resize(total_size); + real_variable_tags.resize(total_size); + } + // Disallow add_variable for non-FF types to prevent implicit conversions (specifically, using indices rather // than values) template uint32_t add_variable(const OT& in) = delete; diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp index 2571bf23179a..178c60309114 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp @@ -45,6 +45,17 @@ void CircuitBuilderBase::update_real_variable_indices(uint32_t index, uint3 template uint32_t CircuitBuilderBase::add_variable(const FF& in) { + uint32_t 
cursor = get_variable_cursor(); + if (cursor != VARIABLE_CURSOR_DISABLED) { + auto thread_idx = get_parallel_thread_index(); + const uint32_t index = variable_cursors_[thread_idx]++; + variables[index] = in; + real_variable_index[index] = index; + next_var_index[index] = REAL_VARIABLE; + prev_var_index[index] = FIRST_VARIABLE_IN_CLASS; + real_variable_tags[index] = DEFAULT_TAG; + return index; + } variables.emplace_back(in); const uint32_t index = static_cast(variables.size()) - 1U; real_variable_index.emplace_back(index); @@ -114,6 +125,13 @@ void CircuitBuilderBase::assert_equal(const uint32_t a_variable_idx, const uint32_t b_variable_idx, std::string const& msg) { + // In cursor mode, defer assert_equal to avoid nondeterministic union-find results + // when multiple threads modify chains rooted at the same shared witness. + // Deferred entries are stored per-task and replayed in task order after all threads join. + if (get_variable_cursor() != VARIABLE_CURSOR_DISABLED) { + deferred_assert_equals_[current_task_idx_].push_back({ a_variable_idx, b_variable_idx, msg }); + return; + } assert_valid_variables({ a_variable_idx, b_variable_idx }); bool values_equal = (get_variable(a_variable_idx) == get_variable(b_variable_idx)); if (!values_equal && !failed()) { diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp index a4ff064e1c94..a0bdffdcf91f 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp @@ -13,6 +13,14 @@ namespace bb { template size_t RomRamLogic_::create_ROM_array(const size_t array_size) { + // In cursor mode (parallel construction), arrays are pre-created during setup. + // Return the next pre-assigned ID for this thread and advance the cursor. 
+ if (rom_cursor_active()) { + size_t id = rom_id_cursors_[get_parallel_thread_index()]++; + BB_ASSERT(id < rom_arrays.size()); + BB_ASSERT(rom_arrays[id].state.size() == array_size); + return id; + } RomTranscript new_transcript; for (size_t i = 0; i < array_size; ++i) { new_transcript.state.emplace_back( @@ -158,7 +166,7 @@ void RomRamLogic_::create_ROM_gate(CircuitBuilder* builder, RomR builder->blocks.memory.populate_wires( record.index_witness, record.value_column1_witness, record.value_column2_witness, record.record_witness); // Note: record the index into the memory block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->check_selector_length_consistency(); builder->increment_num_gates(); } @@ -173,7 +181,7 @@ void RomRamLogic_::create_sorted_ROM_gate(CircuitBuilder* builde builder->blocks.memory.populate_wires( record.index_witness, record.value_column1_witness, record.value_column2_witness, record.record_witness); // Note: record the index into the memory block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->check_selector_length_consistency(); builder->increment_num_gates(); } @@ -276,6 +284,13 @@ template void RomRamLogic_::process_RO template size_t RomRamLogic_::create_RAM_array(const size_t array_size) { + // In cursor mode (parallel construction), arrays are pre-created during setup. 
+ if (ram_cursor_active()) { + size_t id = ram_id_cursors_[get_parallel_thread_index()]++; + BB_ASSERT(id < ram_arrays.size()); + BB_ASSERT(ram_arrays[id].state.size() == array_size); + return id; + } RamTranscript new_transcript; for (size_t i = 0; i < array_size; ++i) { new_transcript.state.emplace_back(UNINITIALIZED_MEMORY_RECORD); @@ -418,7 +433,7 @@ void RomRamLogic_::create_RAM_gate(CircuitBuilder* builder, RamR record.index_witness, record.timestamp_witness, record.value_witness, record.record_witness); // Note: record the index into the block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->increment_num_gates(); } @@ -430,7 +445,7 @@ void RomRamLogic_::create_sorted_RAM_gate(CircuitBuilder* builde builder->blocks.memory.populate_wires( record.index_witness, record.timestamp_witness, record.value_witness, record.record_witness); // Note: record the index into the memory block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->check_selector_length_consistency(); builder->increment_num_gates(); } @@ -442,7 +457,7 @@ void RomRamLogic_::create_final_sorted_RAM_gate(CircuitBuilder* { record.record_witness = builder->add_variable(FF(0)); // Note: record the index into the block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size(); // no -1 since we _haven't_ added the gate yet + record.gate_index = builder->blocks.memory.next_gate_index(); // index where the gate _will_ be written // Create a final gate with all selectors zero (hence unconstrained). In particular, the `MEMORY_SELECTORS` are not // on. Wire values are accessed by the previous RAM gate via shifted wires. 
diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp index 999ada11850a..8e405a88d144 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp @@ -149,6 +149,29 @@ template class RomRamLogic_ { */ std::vector rom_arrays; + // Per-thread ROM/RAM ID cursors for parallel construction. When enabled, create_ROM_array/ + // create_RAM_array return the cursor value and increment it instead of pushing to the vectors. + // Arrays must be pre-created during the setup phase before enabling cursors. + std::vector rom_id_cursors_; // per-thread + std::vector ram_id_cursors_; // per-thread + + void enable_rom_cursor(size_t thread_idx, size_t start) + { + if (thread_idx >= rom_id_cursors_.size()) { + rom_id_cursors_.resize(thread_idx + 1, 0); + } + rom_id_cursors_[thread_idx] = start; + } + void enable_ram_cursor(size_t thread_idx, size_t start) + { + if (thread_idx >= ram_id_cursors_.size()) { + ram_id_cursors_.resize(thread_idx + 1, 0); + } + ram_id_cursors_[thread_idx] = start; + } + bool rom_cursor_active() const { return !rom_id_cursors_.empty(); } + bool ram_cursor_active() const { return !ram_id_cursors_.empty(); } + RomRamLogic_() = default; // ROM operations diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp index da88199681e8..1cc12be2a975 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp @@ -378,15 +378,25 @@ void UltraCircuitBuilder_::create_ecc_add_gate(const ecc_add_gat // The elliptic curve relation assumes q_sign² = 1 (see elliptic_relation.hpp) const FF q_sign = 
in.is_addition ? FF(1) : FF(-1); - // Determine whether we can fuse this addition operation into the previous gate in the block - bool can_fuse_into_previous_gate = - block.size() > 0 && /* a previous gate exists in the block */ - block.w_r()[block.size() - 1] == in.x1 && /* output x coord of previous gate is input of this one */ - block.w_o()[block.size() - 1] == in.y1; /* output y coord of previous gate is input of this one */ + // Determine whether we can fuse this addition into the previous gate in the block. + // In cursor mode, use cursor position (not block.size() which returns pre-allocated total). + // NOTE: For future work-stealing parallelism where task execution order may differ, fusion + // across task boundaries must be handled carefully to maintain determinism. + size_t cursor = block.wire_active_cursor(); + bool can_fuse_into_previous_gate; + size_t prev_idx; + if (cursor != Selector::CURSOR_DISABLED) { + prev_idx = cursor - 1; + can_fuse_into_previous_gate = cursor > 0 && block.w_r()[prev_idx] == in.x1 && block.w_o()[prev_idx] == in.y1; + } else { + prev_idx = block.size() - 1; + can_fuse_into_previous_gate = + block.size() > 0 && block.w_r()[prev_idx] == in.x1 && block.w_o()[prev_idx] == in.y1; + } if (can_fuse_into_previous_gate) { - block.q_1().set(block.size() - 1, q_sign); // set q_sign of previous gate - block.q_elliptic().set(block.size() - 1, 1); // set q_ecc of previous gate to 1 + block.q_1().set(prev_idx, q_sign); // set q_sign of previous gate + block.q_elliptic().set(prev_idx, 1); // set q_ecc of previous gate to 1 } else { block.populate_wires(this->zero_idx(), in.x1, in.y1, this->zero_idx()); block.q_3().emplace_back(0); @@ -427,16 +437,22 @@ void UltraCircuitBuilder_::create_ecc_dbl_gate(const ecc_dbl_gat auto& block = blocks.elliptic; - // Determine whether we can fuse this doubling operation into the previous gate in the block - bool can_fuse_into_previous_gate = - block.size() > 0 && /* a previous gate exists in the block */ - 
block.w_r()[block.size() - 1] == in.x1 && /* output x coord of previous gate is input of this one */ - block.w_o()[block.size() - 1] == in.y1; /* output y coord of previous gate is input of this one */ - + size_t dbl_cursor = block.wire_active_cursor(); + bool can_fuse_into_previous_gate; + size_t dbl_prev_idx; + if (dbl_cursor != Selector::CURSOR_DISABLED) { + dbl_prev_idx = dbl_cursor - 1; + can_fuse_into_previous_gate = + dbl_cursor > 0 && block.w_r()[dbl_prev_idx] == in.x1 && block.w_o()[dbl_prev_idx] == in.y1; + } else { + dbl_prev_idx = block.size() - 1; + can_fuse_into_previous_gate = + block.size() > 0 && block.w_r()[dbl_prev_idx] == in.x1 && block.w_o()[dbl_prev_idx] == in.y1; + } // If possible, update the previous gate to be the first gate in the pair, otherwise create a new gate if (can_fuse_into_previous_gate) { - block.q_elliptic().set(block.size() - 1, 1); // set q_ecc of previous gate to 1 - block.q_m().set(block.size() - 1, 1); // set q_m (q_is_double) of previous gate to 1 + block.q_elliptic().set(dbl_prev_idx, 1); // set q_ecc of previous gate to 1 + block.q_m().set(dbl_prev_idx, 1); // set q_m (q_is_double) of previous gate to 1 } else { block.populate_wires(this->zero_idx(), in.x1, in.y1, this->zero_idx()); block.q_m().emplace_back(1); @@ -484,12 +500,18 @@ uint32_t UltraCircuitBuilder_::put_constant_variable(const FF& v { if (constant_variable_indices.contains(variable)) { return constant_variable_indices.at(variable); - } else { + } + // In cursor mode (parallel construction), don't insert into the shared cache. + // New constants that weren't pre-registered get fresh variables without deduplication. 
+ if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { uint32_t variable_index = this->add_variable(variable); fix_witness(variable_index, variable); - constant_variable_indices.insert({ variable, variable_index }); return variable_index; } + uint32_t variable_index = this->add_variable(variable); + fix_witness(variable_index, variable); + constant_variable_indices.insert({ variable, variable_index }); + return variable_index; } /** @@ -559,7 +581,13 @@ plookup::ReadData UltraCircuitBuilder_::create_gates_f // Get basic lookup table; construct and add to builder.lookup_tables if not already present plookup::BasicTable& table = get_table(multi_table.basic_table_ids[i]); - table.lookup_gates.emplace_back(read_values.lookup_entries[i]); + // In cursor mode, defer the lookup gate entry to avoid races on table.lookup_gates + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + auto tidx = get_parallel_thread_index(); + deferred_lookup_gates_.defer(tidx, { multi_table.basic_table_ids[i], read_values.lookup_entries[i] }); + } else { + table.lookup_gates.emplace_back(read_values.lookup_entries[i]); + } // Create witness variables: first lookup reuses user's input indices, subsequent create new variables const auto first_idx = is_first_lookup ? key_a_index : this->add_variable(read_values[ColumnIdx::C1][i]); @@ -756,6 +784,13 @@ void UltraCircuitBuilder_::create_small_range_constraint(const u const uint64_t target_range, std::string const msg) { + // In cursor mode, defer range constraint to avoid races on range_lists and real_variable_tags + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + auto tidx = get_parallel_thread_index(); + deferred_range_constraints_.defer(tidx, { variable_index, target_range }); + return; + } + // make sure `target_range` is not too big. 
BB_ASSERT_GTE(MAX_SMALL_RANGE_CONSTRAINT_VAL, target_range); const bool is_out_of_range = (uint256_t(this->get_variable(variable_index)).data[0] > target_range); @@ -1522,7 +1557,8 @@ std::array UltraCircuitBuilder_::queue_partial_non_ const uint32_t hi_0_idx = this->add_variable(hi_0); const uint32_t hi_1_idx = this->add_variable(hi_1); - // Add witnesses into the multiplication cache (duplicates removed during circuit finalization) + // Add witnesses into the multiplication cache (duplicates removed during circuit finalization). + // In cursor mode, defer to per-thread buffer to avoid races on the shared vector. cached_partial_non_native_field_multiplication cache_entry{ .a = input.a, .b = input.b, @@ -1530,7 +1566,11 @@ std::array UltraCircuitBuilder_::queue_partial_non_ .hi_0 = hi_0_idx, .hi_1 = hi_1_idx, }; - cached_partial_non_native_field_multiplications.emplace_back(cache_entry); + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + deferred_non_native_field_muls_.defer(get_parallel_thread_index(), cache_entry); + } else { + cached_partial_non_native_field_multiplications.emplace_back(cache_entry); + } return std::array{ lo_0_idx, hi_1_idx }; } diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp index 2bcfae4938f7..be5c150980ae 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp @@ -14,7 +14,9 @@ #include "circuit_builder_base.hpp" #include "rom_ram_logic.hpp" #include +#include #include +#include #include #include "barretenberg/serialize/msgpack.hpp" @@ -200,11 +202,224 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase constant_variable_indices; + /** + * @brief Per-thread buffer for deferring operations during parallel construction. 
+ * @details Operations that modify shared builder state are buffered per-thread during + * execute_parallel and replayed sequentially after all threads join. The replay callback + * receives each entry and applies it to the builder. + */ + template struct DeferredBuffer { + std::vector> thread_buffers; + + void init(size_t num_threads) { thread_buffers.resize(num_threads); } + + void defer(size_t thread_idx, Entry&& entry) { thread_buffers[thread_idx].emplace_back(std::move(entry)); } + + void defer(size_t thread_idx, const Entry& entry) { thread_buffers[thread_idx].push_back(entry); } + + template void apply(Callback&& callback) + { + for (auto& buf : thread_buffers) { + for (auto& entry : buf) { + callback(entry); + } + buf.clear(); + } + } + }; + + struct DeferredLookupEntry { + plookup::BasicTableId table_id; + plookup::BasicTable::LookupEntry entry; + }; + struct DeferredRangeConstraint { + uint32_t variable_index; + uint64_t target_range; + }; + + DeferredBuffer deferred_lookup_gates_; + DeferredBuffer deferred_range_constraints_; + DeferredBuffer deferred_non_native_field_muls_; + + void init_deferred_buffers(size_t num_threads) + { + deferred_lookup_gates_.init(num_threads); + deferred_range_constraints_.init(num_threads); + deferred_non_native_field_muls_.init(num_threads); + } + + /** + * @brief Per-block gate counts and variable count for a task (one or more opcodes). + */ + struct TaskBlockSizes { + std::array block_sizes{}; + size_t num_variables = 0; + size_t num_rom_arrays = 0; + size_t num_ram_arrays = 0; + }; + + /** + * @brief Snapshot the current block sizes and variable count. + */ + TaskBlockSizes snapshot_block_sizes() const + { + TaskBlockSizes s; + auto block_refs = blocks.get(); + for (size_t i = 0; i < ExecutionTrace::NUM_BLOCKS; i++) { + s.block_sizes[i] = block_refs[i].size(); + } + s.num_variables = this->get_num_variables(); + return s; + } + + /** + * @brief Compute the delta between two snapshots (after - before). 
+ */ + static TaskBlockSizes delta(const TaskBlockSizes& before, const TaskBlockSizes& after) + { + TaskBlockSizes d; + for (size_t i = 0; i < ExecutionTrace::NUM_BLOCKS; i++) { + d.block_sizes[i] = after.block_sizes[i] - before.block_sizes[i]; + } + d.num_variables = after.num_variables - before.num_variables; + return d; + } + + /** + * @brief Execute tasks in parallel on this builder. Each task is a lambda that adds gates to the builder. + * @details Pre-allocates blocks and variables based on per-task sizes, then dispatches tasks across threads + * with per-thread cursors. After joining, replays deferred lookup and range constraint operations. + * + * @param tasks Vector of lambdas, each taking (UltraCircuitBuilder_&) and adding gates + * @param task_sizes Per-task block sizes and variable counts (must match tasks.size()) + * @param num_threads Number of threads to use (tasks are distributed round-robin) + */ + void execute_parallel(const std::vector>& tasks, + const std::vector& task_sizes, + size_t num_threads) + { + BB_ASSERT(tasks.size() == task_sizes.size()); + if (tasks.empty()) { + return; + } + num_threads = std::min(num_threads, tasks.size()); + + // Compute total sizes and per-task offsets + auto base = snapshot_block_sizes(); + std::vector offsets(tasks.size()); + TaskBlockSizes running = base; + for (size_t t = 0; t < tasks.size(); t++) { + offsets[t] = running; + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + running.block_sizes[b] += task_sizes[t].block_sizes[b]; + } + running.num_variables += task_sizes[t].num_variables; + running.num_rom_arrays += task_sizes[t].num_rom_arrays; + running.num_ram_arrays += task_sizes[t].num_ram_arrays; + } + + // Pre-allocate all blocks and variables to total size + auto block_refs = blocks.get(); + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + auto& block = block_refs[b]; + for (auto& wire : block.wires) { + wire.resize(running.block_sizes[b], 0); + } + for (auto& sel : 
block.get_selectors()) { + sel.resize(running.block_sizes[b]); + } + } + this->resize_variables(running.num_variables); + init_deferred_buffers(num_threads); + this->init_deferred_assert_equal_buffers(tasks.size()); + + // Assign tasks to threads (round-robin) + std::vector> thread_tasks(num_threads); + for (size_t t = 0; t < tasks.size(); t++) { + thread_tasks[t % num_threads].push_back(t); + } + + // Pre-initialize all cursors on the main thread to avoid races on cursor vector resizing. + // For threads with multiple tasks, we set the cursor to the first task's offset; + // within the thread, cursors are updated sequentially between tasks. + auto block_refs_setup = blocks.get(); + for (size_t tid = 0; tid < num_threads; tid++) { + if (!thread_tasks[tid].empty()) { + size_t first_task = thread_tasks[tid][0]; + // Enable cursors for ALL blocks, not just ones the first task uses. + // Later tasks on this thread may use blocks the first task doesn't. + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + block_refs_setup[b].enable_cursor_mode(tid, offsets[first_task].block_sizes[b]); + } + this->enable_variable_cursor(tid, static_cast(offsets[first_task].num_variables)); + rom_ram_logic.enable_rom_cursor(tid, offsets[first_task].num_rom_arrays); + rom_ram_logic.enable_ram_cursor(tid, offsets[first_task].num_ram_arrays); + } + } + + // Dispatch threads + std::vector threads; + threads.reserve(num_threads); + + for (size_t tid = 0; tid < num_threads; tid++) { + threads.emplace_back([this, tid, &tasks, &offsets, &thread_tasks]() { + set_parallel_thread_index(tid); + + for (size_t i = 0; i < thread_tasks[tid].size(); i++) { + size_t task_idx = thread_tasks[tid][i]; + + // For subsequent tasks (not the first), update cursors to this task's offsets + if (i > 0) { + auto block_refs_local = blocks.get(); + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + block_refs_local[b].enable_cursor_mode(tid, offsets[task_idx].block_sizes[b]); + } + 
this->enable_variable_cursor(tid, static_cast(offsets[task_idx].num_variables)); + rom_ram_logic.enable_rom_cursor(tid, offsets[task_idx].num_rom_arrays); + rom_ram_logic.enable_ram_cursor(tid, offsets[task_idx].num_ram_arrays); + } + + // Execute the task + this->set_current_task_index(task_idx); + tasks[task_idx](*this); + } + + // Disable all cursors for this thread + auto block_refs_local = blocks.get(); + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + block_refs_local[b].disable_cursor_mode(tid); + } + this->disable_variable_cursor(tid); + }); + } + + // Join all threads + for (auto& t : threads) { + t.join(); + } + + // Clear ROM/RAM cursors so subsequent sequential operations use normal path + rom_ram_logic.rom_id_cursors_.clear(); + rom_ram_logic.ram_id_cursors_.clear(); + + // Replay deferred operations + deferred_lookup_gates_.apply([this](auto& e) { + auto& table = get_table(e.table_id); + table.lookup_gates.emplace_back(e.entry); + }); + deferred_range_constraints_.apply( + [this](auto& e) { create_small_range_constraint(e.variable_index, e.target_range); }); + deferred_non_native_field_muls_.apply( + [this](auto& e) { cached_partial_non_native_field_multiplications.emplace_back(e); }); + this->apply_deferred_assert_equals(); + } + // Rom/Ram logic RomRamLogic rom_ram_logic; // Stores gate index of ROM/RAM reads (required by proving key) std::vector memory_read_records; + // Stores gate index of RAM writes (required by proving key) std::vector memory_write_records; // Range constraints to be batched, keyed by target_range. See create_small_range_constraint() for details. @@ -635,7 +850,14 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase x*(x^-1) = 1). 
*/ - void update_used_witnesses(uint32_t var_idx) { used_witnesses.emplace_back(var_idx); } + void update_used_witnesses(uint32_t var_idx) + { + // Skip in cursor mode to avoid races on shared used_witnesses vector + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } + used_witnesses.emplace_back(var_idx); + } /** * @brief Add a list of witness indices to the boomerang exclusion list @@ -646,6 +868,10 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase& used_indices) { + // Skip in cursor mode to avoid races on shared used_witnesses vector + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } used_witnesses.reserve(used_witnesses.size() + used_indices.size()); for (const auto& it : used_indices) { used_witnesses.emplace_back(it); @@ -659,7 +885,13 @@ class UltraCircuitBuilder_ : public CircuitBuilderBaseget_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } + finalize_witnesses.insert(var_idx); + } /** * @brief Add a list of witness indices to the finalize exclusion list @@ -670,6 +902,9 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase& finalize_indices) { + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } for (const auto& it : finalize_indices) { finalize_witnesses.insert(it); }