diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp index 15ec2cf72963..8bf61a0f2bfb 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp @@ -251,4 +251,408 @@ template <> MegaCircuitBuilder create_circuit(AcirProgram& program, const Progra template void build_constraints(UltraCircuitBuilder&, AcirFormat&, const ProgramMetadata&); template void build_constraints(MegaCircuitBuilder&, AcirFormat&, const ProgramMetadata&); +/** + * @brief Profile data for a constraint type, extracted from a throwaway builder. + * @details Eventually this will be a compile-time table lookup. For now, it's computed + * by running constraints on a throwaway builder and extracting the resulting state. + */ +template struct ConstraintProfile { + typename Builder::TaskBlockSizes block_sizes; + std::vector constants; // constant values to pre-register + std::vector range_list_targets; // range list target ranges to pre-create + std::vector table_ids; // lookup tables to pre-create + size_t num_rom_arrays_per_instance = 0; // ROM arrays created per constraint instance + size_t num_ram_arrays_per_instance = 0; // RAM arrays created per constraint instance + std::vector rom_array_sizes; // sizes of ROM arrays created per instance + std::vector ram_array_sizes; // sizes of RAM arrays created per instance +}; + +/** + * @brief Profile a constraint type by running it on a throwaway builder and extracting cache state. + * @details Runs two instances: the first triggers one-time setup, the second measures steady-state cost. + * Extracts all constants, range list targets, and lookup table IDs that the constraint type needs. + * This simulates the eventual table lookup. 
+ */ +template +ConstraintProfile profile_constraint_type(ConstraintType representative, + Handler&& handler, + size_t num_witnesses) +{ + ConstraintProfile profile; + + // Phase A: Run one instance on a throwaway builder to discover setup needs (constants, range lists, etc.) + WitnessVector dummy_witness(num_witnesses, bb::fr(0)); + // Construct throwaway builder — Mega needs a default op_queue, Ultra uses the 3-arg constructor + auto make_builder = [&]() -> Builder { + if constexpr (std::is_same_v) { + return Builder(dummy_witness, {}, /*is_write_vk_mode=*/true); + } else { + return Builder(std::make_shared(), dummy_witness, {}, /*is_write_vk_mode=*/true); + } + }; + + Builder warmup_builder = make_builder(); + handler(warmup_builder, representative); + + // Extract setup data from the warmup builder + for (const auto& [value, _] : warmup_builder.constant_variable_indices) { + profile.constants.push_back(value); + } + for (const auto& [target_range, _] : warmup_builder.range_lists) { + profile.range_list_targets.push_back(target_range); + } + for (const auto& table : warmup_builder.get_lookup_tables()) { + profile.table_ids.push_back(table.id); + } + + // Phase B: Measure steady-state cost on a SEPARATE builder pre-populated with setup data. + // This ensures no cross-instance gate fusion at the boundary, matching cursor-mode behavior + // where each task starts with no prior gates in its block region. 
+ Builder measure_builder = make_builder(); + for (const auto& value : profile.constants) { + measure_builder.put_constant_variable(value); + } + for (const auto target_range : profile.range_list_targets) { + if (measure_builder.range_lists.count(target_range) == 0) { + measure_builder.range_lists.insert({ target_range, measure_builder.create_range_list(target_range) }); + } + } + for (const auto table_id : profile.table_ids) { + measure_builder.get_table(table_id); + } + + auto before = measure_builder.snapshot_block_sizes(); + size_t rom_before = measure_builder.rom_ram_logic.rom_arrays.size(); + size_t ram_before = measure_builder.rom_ram_logic.ram_arrays.size(); + handler(measure_builder, representative); + auto after = measure_builder.snapshot_block_sizes(); + profile.block_sizes = Builder::delta(before, after); + + // Extract ROM/RAM array counts per instance + profile.num_rom_arrays_per_instance = measure_builder.rom_ram_logic.rom_arrays.size() - rom_before; + profile.num_ram_arrays_per_instance = measure_builder.rom_ram_logic.ram_arrays.size() - ram_before; + for (size_t i = rom_before; i < measure_builder.rom_ram_logic.rom_arrays.size(); i++) { + profile.rom_array_sizes.push_back(measure_builder.rom_ram_logic.rom_arrays[i].state.size()); + } + for (size_t i = ram_before; i < measure_builder.rom_ram_logic.ram_arrays.size(); i++) { + profile.ram_array_sizes.push_back(measure_builder.rom_ram_logic.ram_arrays[i].state.size()); + } + + return profile; +} + +/** + * @brief Prepare a builder's caches from constraint profiles WITHOUT running any constraints. + * @details Populates the builder's constant cache, range lists, and lookup tables using data + * extracted from profiles. After this, all parallel constraint execution will find everything + * cached — no cache misses, no one-time setup costs. 
+ */ +template +void prepare_builder_from_profiles(Builder& builder, const std::vector>& profiles) +{ + // Register all constants from all profiles + for (const auto& profile : profiles) { + for (const auto& value : profile.constants) { + builder.put_constant_variable(value); + } + } + + // Create all needed range lists + for (const auto& profile : profiles) { + for (const auto target_range : profile.range_list_targets) { + if (builder.range_lists.count(target_range) == 0) { + builder.range_lists.insert({ target_range, builder.create_range_list(target_range) }); + } + } + } + + // Note: lookup tables are NOT created here. They are created in task order in Phase 2b + // so that table indices match sequential constraint processing order. +} + +template +void build_constraints_parallel(Builder& builder, + AcirFormat& constraints, + const ProgramMetadata& metadata, + size_t num_threads) +{ + using TaskBlockSizes = typename Builder::TaskBlockSizes; + size_t num_witnesses = constraints.max_witness_index + 1; + + // Phase 1: Profile each constraint type to build a map from grouping key to profile. + // Each constraint type has a key function that determines which instances share the same + // gate count profile. We profile one representative per unique key. + // + // Phase 1b: Collect tasks in the SAME ORDER as sequential build_constraints processes them. + // This ensures that lookup tables, ROM arrays, and other ordering-dependent state are created + // in an order that matches sequential, making the circuits identical up to gate reordering. + + // Use the UltraCircuitBuilder_ base type for task functions since execute_parallel is defined there. + // MegaCircuitBuilder inherits from UltraCircuitBuilder_, so this works for both. + using BaseBuilder = UltraCircuitBuilder_; + + std::vector> profiles; + std::vector> tasks; + std::vector task_sizes; + std::vector task_profile_indices; + + // Helper: profile unique keys in a constraint vector, then add tasks in vector order. 
+ // Combines profiling and task collection in a single call per constraint type. + auto profile_and_collect = [&](auto& items, auto handler, auto key_fn) { + if (items.empty()) { + return; + } + using Key = decltype(key_fn(items[0])); + std::map key_to_profile; + // Phase 1: profile unique keys + for (size_t i = 0; i < items.size(); i++) { + Key k = key_fn(items[i]); + if (key_to_profile.count(k) == 0) { + auto profile = profile_constraint_type(items[i], handler, num_witnesses); + key_to_profile[k] = profiles.size(); + profiles.push_back(profile); + } + } + // Phase 1b: add tasks in vector order + for (size_t i = 0; i < items.size(); i++) { + size_t profile_idx = key_to_profile.at(key_fn(items[i])); + const auto& profile = profiles[profile_idx]; + auto sizes = profile.block_sizes; + sizes.num_rom_arrays = profile.num_rom_arrays_per_instance; + sizes.num_ram_arrays = profile.num_ram_arrays_per_instance; + tasks.emplace_back([handler, &items, i](BaseBuilder& b) { handler(static_cast(b), items[i]); }); + task_sizes.push_back(sizes); + task_profile_indices.push_back(profile_idx); + } + }; + + // For constraint types with no grouping (fixed gate count), the key is a constant. 
+ auto const_key = [](const auto&) -> int { return 0; }; + + // Define key functions for each grouped type + auto big_quad_key = [](const BigQuadConstraint& c) -> size_t { return c.size(); }; + auto logic_key = [](const LogicConstraint& c) -> std::pair { + return { c.num_bits, c.is_xor_gate }; + }; + auto range_key = [](const RangeConstraint& c) -> uint32_t { return c.num_bits; }; + auto aes_key = [](const AES128Constraint& c) -> size_t { return c.inputs.size(); }; + auto blake2s_key = [](const Blake2sConstraint& c) -> size_t { return c.inputs.size(); }; + auto blake3_key = [](const Blake3Constraint& c) -> size_t { return c.inputs.size(); }; + auto pos2_key = [](const Poseidon2Constraint& c) -> size_t { return c.state.size(); }; + auto msm_key = [](const MultiScalarMul& c) -> std::vector { + std::vector key; + key.reserve(c.points.size() + c.scalars.size()); + for (const auto& p : c.points) + key.push_back(p.is_constant); + for (const auto& s : c.scalars) + key.push_back(s.is_constant); + return key; + }; + + // Define handlers + auto quad_handler = [](Builder& b, QuadConstraint& c) { create_quad_constraint(b, c); }; + auto big_quad_handler = [](Builder& b, BigQuadConstraint& c) { create_big_quad_constraint(b, c); }; + auto logic_handler = [](Builder& b, const LogicConstraint& c) { + create_logic_gate(b, c.a, c.b, c.result, c.num_bits, c.is_xor_gate); + }; + auto range_handler = [](Builder& b, const RangeConstraint& c) { + b.create_dyadic_range_constraint(c.witness, c.num_bits, "parallel range constraint"); + }; + auto aes_handler = [](Builder& b, const AES128Constraint& c) { create_aes128_constraints(b, c); }; + auto sha_handler = [](Builder& b, const Sha256Compression& c) { create_sha256_compression_constraints(b, c); }; + auto ecdsa_k1_handler = [](Builder& b, const EcdsaConstraint& c) { + create_ecdsa_verify_constraints>(b, c); + }; + auto ecdsa_r1_handler = [](Builder& b, const EcdsaConstraint& c) { + create_ecdsa_verify_constraints>(b, c); + }; + auto 
blake2s_handler = [](Builder& b, const Blake2sConstraint& c) { create_blake2s_constraints(b, c); }; + auto blake3_handler = [](Builder& b, const Blake3Constraint& c) { create_blake3_constraints(b, c); }; + auto keccak_handler = [](Builder& b, const Keccakf1600& c) { create_keccak_permutations_constraints(b, c); }; + auto pos2_handler = [](Builder& b, const Poseidon2Constraint& c) { + create_poseidon2_permutations_constraints(b, c); + }; + auto msm_handler = [](Builder& b, const MultiScalarMul& c) { create_multi_scalar_mul_constraint(b, c); }; + auto ec_add_handler = [](Builder& b, const EcAdd& c) { create_ec_add_constraint(b, c); }; + + // Profile and collect tasks in the same order as sequential build_constraints. + // Each call profiles unique keys, then adds tasks in constraint vector order. + profile_and_collect(constraints.quad_constraints, quad_handler, const_key); + profile_and_collect(constraints.big_quad_constraints, big_quad_handler, big_quad_key); + profile_and_collect(constraints.logic_constraints, logic_handler, logic_key); + profile_and_collect(constraints.range_constraints, range_handler, range_key); + profile_and_collect(constraints.aes128_constraints, aes_handler, aes_key); + profile_and_collect(constraints.sha256_compression, sha_handler, const_key); + profile_and_collect(constraints.ecdsa_k1_constraints, ecdsa_k1_handler, const_key); + profile_and_collect(constraints.ecdsa_r1_constraints, ecdsa_r1_handler, const_key); + profile_and_collect(constraints.blake2s_constraints, blake2s_handler, blake2s_key); + profile_and_collect(constraints.blake3_constraints, blake3_handler, blake3_key); + profile_and_collect(constraints.keccak_permutations, keccak_handler, const_key); + profile_and_collect(constraints.poseidon2_constraints, pos2_handler, pos2_key); + profile_and_collect(constraints.multi_scalar_mul_constraints, msm_handler, msm_key); + profile_and_collect(constraints.ec_add_constraints, ec_add_handler, const_key); + + // Recursion constraints are 
parallelized like other constraint types, but each task also + // captures a HonkRecursionConstraintOutput for post-join merging (needed for pairing point + // propagation and IPA finalization). + struct RecursionTaskInfo { + HonkRecursionConstraintOutput output; + bool update_ipa_data = false; + bool is_root_rollup = false; + }; + size_t num_rec_tasks = constraints.honk_recursion_constraints.size() + + constraints.chonk_recursion_constraints.size() + + constraints.avm_recursion_constraints.size(); + std::vector recursion_task_outputs(num_rec_tasks); + size_t rec_out_idx = 0; + + // Helper: execute a single honk recursion constraint based on proof_type + auto execute_honk_recursion = [](Builder& b, + const RecursionConstraint& c) -> HonkRecursionConstraintOutput { + if (c.proof_type == HONK_ZK) { + return create_honk_recursion_constraints, + stdlib::recursion::honk::DefaultIO>(b, c); + } else if (c.proof_type == HONK) { + return create_honk_recursion_constraints, + stdlib::recursion::honk::DefaultIO>(b, c); + } else { + // Rollup IO is only supported on UltraCircuitBuilder + if constexpr (std::is_same_v) { + return create_honk_recursion_constraints, + stdlib::recursion::honk::RollupIO>(b, c); + } else { + bb::assert_failure("Rollup Honk proof type not supported on MegaBuilder"); + return {}; + } + } + }; + + // Profiling handler (discards output — only used for measuring gate counts) + auto honk_rec_handler = [&execute_honk_recursion](Builder& b, const RecursionConstraint& c) { + execute_honk_recursion(b, c); + }; + auto honk_rec_key = [](const RecursionConstraint& c) -> uint32_t { return c.proof_type; }; + + // Profile honk recursion constraints by proof_type + std::map honk_rec_profiles; + for (size_t i = 0; i < constraints.honk_recursion_constraints.size(); i++) { + uint32_t k = honk_rec_key(constraints.honk_recursion_constraints[i]); + if (honk_rec_profiles.count(k) == 0) { + auto profile = profile_constraint_type( + constraints.honk_recursion_constraints[i], 
honk_rec_handler, num_witnesses); + honk_rec_profiles[k] = profiles.size(); + profiles.push_back(profile); + } + } + // Add honk recursion tasks in vector order with output capture + for (size_t i = 0; i < constraints.honk_recursion_constraints.size(); i++) { + const auto& c = constraints.honk_recursion_constraints[i]; + size_t profile_idx = honk_rec_profiles.at(c.proof_type); + const auto& profile = profiles[profile_idx]; + auto sizes = profile.block_sizes; + sizes.num_rom_arrays = profile.num_rom_arrays_per_instance; + sizes.num_ram_arrays = profile.num_ram_arrays_per_instance; + + size_t out_idx = rec_out_idx++; + recursion_task_outputs[out_idx].update_ipa_data = + (c.proof_type == ROLLUP_HONK || c.proof_type == ROOT_ROLLUP_HONK); + recursion_task_outputs[out_idx].is_root_rollup = (c.proof_type == ROOT_ROLLUP_HONK); + + tasks.emplace_back( + [&constraints, i, &execute_honk_recursion, &recursion_task_outputs, out_idx](BaseBuilder& b) { + recursion_task_outputs[out_idx].output = + execute_honk_recursion(static_cast(b), constraints.honk_recursion_constraints[i]); + }); + task_sizes.push_back(sizes); + task_profile_indices.push_back(profile_idx); + } + + // TODO: Chonk and AVM recursion constraints — same pattern as honk above. + // For now they fall through to Phase 4 sequential processing if present. + + // Phase 2: Prepare the builder's caches from profiles (no constraint execution). + prepare_builder_from_profiles(builder, profiles); + + // Phase 2b: Pre-create lookup tables and ROM/RAM arrays in task order (matching sequential + // constraint processing order). This ensures table indices and ROM IDs are deterministic + // and match what sequential build_constraints would produce. 
+ for (size_t t = 0; t < tasks.size(); t++) { + const auto& profile = profiles[task_profile_indices[t]]; + for (const auto table_id : profile.table_ids) { + builder.get_table(table_id); // no-op if already created + } + for (size_t r = 0; r < profile.num_rom_arrays_per_instance; r++) { + builder.rom_ram_logic.create_ROM_array(profile.rom_array_sizes[r]); + } + for (size_t r = 0; r < profile.num_ram_arrays_per_instance; r++) { + builder.rom_ram_logic.create_RAM_array(profile.ram_array_sizes[r]); + } + } + + // Phase 3: Execute ALL instances in parallel (including recursion constraints) + if (!tasks.empty()) { + builder.execute_parallel(tasks, task_sizes, num_threads); + } + + // Phase 4: Block constraints (sequential — these reference variables from earlier constraints). + for (const auto& [constraint, opcode_indices] : + zip_view(constraints.block_constraints, constraints.original_opcode_indices.block_constraints)) { + create_block_constraints(builder, constraint); + } + + // Phase 4b: Merge recursion outputs from parallel tasks and process remaining sequential recursion. 
+ { + HonkRecursionConstraintsOutput output; + + // Merge outputs from honk recursion tasks that ran in Phase 3 + for (size_t i = 0; i < rec_out_idx; i++) { + const auto& rec = recursion_task_outputs[i]; + output.update(rec.output, rec.update_ipa_data); + if (rec.is_root_rollup) { + output.is_root_rollup = true; + } + } + + // Chonk and AVM recursion constraints — Ultra only, sequential for now (TODO: parallelize) + if constexpr (std::is_same_v) { + for (const auto& constraint : constraints.chonk_recursion_constraints) { + auto honk_output = create_chonk_recursion_constraints(builder, constraint); + output.update(honk_output, /*update_ipa_data=*/true); + } + for (const auto& constraint : constraints.avm_recursion_constraints) { + auto honk_output = create_avm2_recursion_constraints_goblin(builder, constraint); + output.update(honk_output, /*update_ipa_data=*/true); + } + } + + // HyperNova recursion constraints (Mega only, requires IVC state — always sequential) + const bool is_hn_recursion_constraints = !constraints.hn_recursion_constraints.empty(); + if (is_hn_recursion_constraints) { + GateCounter gate_counter{ &builder, false }; + std::vector dummy_gates_per_opcode; + auto hn_output = create_recursion_constraints( + builder, + gate_counter, + dummy_gates_per_opcode, + metadata.ivc, + { {}, {} }, + { {}, {} }, + { constraints.hn_recursion_constraints, constraints.original_opcode_indices.hn_recursion_constraints }, + { {}, {} }); + output.update(hn_output, /*update_ipa_data=*/false); + } + + output.finalize(builder, is_hn_recursion_constraints, metadata.has_ipa_claim); + } +} + +template void build_constraints_parallel(UltraCircuitBuilder&, + AcirFormat&, + const ProgramMetadata&, + size_t); +template void build_constraints_parallel(MegaCircuitBuilder&, + AcirFormat&, + const ProgramMetadata&, + size_t); + } // namespace acir_format diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp 
b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp index b00b86881573..2c36c61f780f 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp @@ -162,4 +162,22 @@ Builder create_circuit(AcirProgram& program, const ProgramMetadata& metadata = P template void build_constraints(Builder& builder, AcirFormat& constraints, const ProgramMetadata& metadata); +/** + * @brief Parallel variant of build_constraints (instantiated for both Ultra and Mega builders). + * @details Profiles one representative per constraint type/grouping key on a throwaway builder, + * pre-creates all setup state (constants, range lists, lookup tables, ROM/RAM arrays), then + * executes all constraint instances in parallel via execute_parallel. Produces bit-identical + * circuits for any thread count (N=1 matches N=32); note the circuit (and hence the VK) differs + * from sequential build_constraints because setup gates are hoisted to the front. + * + * @param builder Must be constructed with the witness and public inputs already set + * @param constraints The ACIR constraints to process + * @param metadata Program metadata + * @param num_threads Number of threads for parallel execution + */ +template +void build_constraints_parallel(Builder& builder, + AcirFormat& constraints, + const ProgramMetadata& metadata, + size_t num_threads); + } // namespace acir_format diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/parallel_circuit_construction_poc.md b/barretenberg/cpp/src/barretenberg/dsl/acir_format/parallel_circuit_construction_poc.md new file mode 100644 index 000000000000..25df3bed0455 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/parallel_circuit_construction_poc.md @@ -0,0 +1,372 @@ +# Parallel ACIR Circuit Construction + +## Status: PoC complete, ready for production PR + +The core parallel execution mechanism is proven and tested. The originally-planned precursor +refactor (separating setup gates from constraint gates in sequential `build_constraints`) turned +out to be unnecessary — see "Key insight" below. 
+ +## Key insight: no precursor PR needed + +The original plan called for a two-PR approach: first refactor sequential `build_constraints` to +pre-create setup state, then add the parallel path. The concern was that sequential and parallel +paths would produce different circuits because setup gates land in different positions. + +**The realization:** `build_constraints_parallel` with N=1 threads already produces bit-identical +circuits to N=2 threads. Both paths go through `prepare_builder_from_profiles` (which pre-creates +constants, range lists, and lookup tables), so setup gates land in the same position regardless of +thread count. Sequential execution is just the N=1 special case of parallel execution. + +**Validated:** `BuildConstraintsParallelN1vsN2` test passes — full wire-by-wire, selector-by-selector, +variable, and union-find comparison between 1-thread and 2-thread parallel construction. Zero +mismatches. + +This means we can ship the parallel infrastructure in a single PR: +1. Wire `build_constraints_parallel` into `create_circuit` for `UltraCircuitBuilder` +2. The old sequential `build_constraints` remains for other builder types (Mega) +3. Update VKs (they change because setup gates move to the beginning) +4. All existing tests pass — the parallel path with any thread count is a drop-in replacement + +**Note on scope:** The builder changes (cursors, deferred buffers, `execute_parallel`) only affect +the ACIR construction path. Direct C++ circuit construction is unchanged — cursor mode is opt-in, +entered only through `execute_parallel`. The lazy-init behavior of `put_constant_variable`, +`create_range_list`, and `get_table` is preserved for all non-ACIR usage. + +## CRITICAL INVARIANT: Bit-identical circuits for any N + +**Circuits produced by `build_constraints_parallel` MUST be bit-identical regardless of thread +count.** N=1, N=2, N=32 must all produce the exact same circuit — same wires, same selectors, +same variable indices, same union-find. 
This is non-negotiable because: + +- Different circuits produce different VKs +- VKs are hardcoded in the protocol (Aztec L1 contracts) +- If the circuit depends on thread count, different machines with different core counts would + produce incompatible proofs +- The verifier must be able to verify proofs from any prover regardless of hardware + +**This means:** Any mechanism that could produce different gate counts or variable indices based +on thread assignment is a bug. This includes: +- Gate fusion that depends on task-to-thread assignment +- Shared mutable state accessed in nondeterministic order +- Any code path that reads `block.size()` in cursor mode (returns pre-allocated total, not cursor) + +## Next step: production PR + +**Key files:** +- `barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp` — wire `build_constraints_parallel` + into `create_circuit` +- `barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp` — declarations +- Builder files (already modified on PoC branch): `execution_trace_block.hpp`, `circuit_builder_base.hpp`, + `circuit_builder_base_impl.hpp`, `ultra_circuit_builder.hpp`, `ultra_circuit_builder.cpp` + +**Verification:** +- All existing tests must pass (no behavioral change for constraint correctness) +- `CircuitChecker::check()` must pass on all circuits +- `BuildConstraintsParallelN1vsN2` validates bit-identical circuits across thread counts +- VKs WILL change — run `barretenberg/cpp/scripts/test_chonk_standalone_vks_havent_changed.sh --update_inputs` + to pin new VKs after verifying correctness + +## Motivation + +Circuit construction from ACIR is entirely single-threaded — the biggest sequential bottleneck as +core count increases and as GPU-accelerated proving makes parallel work (MSM, sumcheck) cheaper. 
+ +Measured on real transaction (`ecdsar1+transfer_1_recursions+sponsored_fpc`, 11 circuits): + +| Cores | Total prove time | create_circuit | ProverInstance | Sequential circuit % | +|-------|------------------|----------------|----------------|---------------------| +| 1 | ~25s | 844ms | 815ms | ~6.6% | +| 8 | ~5.4s | 709ms | 497ms | ~22% | +| 32 | 5.58s | 702ms (12.6%) | 528ms (9.5%) | **22%** | + +With GPU proving, sequential circuit construction could become 50%+ of total time. + +## What the PoC proved + +### Core mechanism: cursor-based parallel writes + +Threads write to pre-allocated regions of a shared builder using per-thread cursors. No builder +duplication, no merge, no wire index remapping. The existing stdlib code (SHA256, Poseidon2, etc.) +is completely unmodified — it calls the same `populate_wires`, `emplace_back`, `add_variable` APIs, +which internally route through cursors when in parallel mode. + +**Tested with:** Two `std::thread`s running SHA256 + Poseidon2 concurrently on a shared builder. +Zero mismatches across all blocks, selectors, variables, and union-find. 500/500 stress test. + +### Copy constraints (assert_equal) are safe under concurrency + +Each opcode's `assert_equal` calls only touch its own internal variables and its own unique output +witnesses. The union-find modifications operate on disjoint variable sets across threads. No +deferral needed. + +**Key invariant:** ACIR output witnesses are unique per opcode. No two opcodes call `assert_equal` +on the same ACIR witness. The stdlib pattern is always: read inputs → compute → assert outputs +equal witnesses. The "ripple" from `update_real_variable_indices` stays contained within one opcode. + +**Tested with:** Chained SHA256 opcodes where A's output witnesses are B's input witnesses, running +on real concurrent threads. Union-find bit-identical to sequential (verified all 43,704 variables). 
+ +### Range constraints and lookup gates: deferred per-thread, replayed after join + +These are the only operations that append to shared collections (`range_lists[target].variable_indices` +and `table.lookup_gates`). Both are order-independent: +- `variable_indices` gets sorted and deduplicated in `process_range_list` +- `lookup_gates` just counts occurrences in `construct_lookup_read_counts` + +Per-thread buffers, concatenated in deterministic thread order after join. + +**Tested with:** 5,792 deferred lookup entries + 458 deferred range constraints from chained SHA256, +all replayed correctly. Finalized circuits pass CircuitChecker. + +### Gate construction is deterministic across threads + +After finalization, every block (arithmetic, lookup, delta_range, elliptic, poseidon2_external, +poseidon2_internal, pub_inputs) is bit-identical between sequential and parallel construction — +verified wire-by-wire and selector-by-selector. Variable counts match. Union-find matches. + +### `put_constant_variable` is safe with read-only cache after warmup + +All constants for tested opcodes (SHA256, Poseidon2) are fully covered by one warmup instance. +In cursor mode, the cache is read-only — lookups are safe for concurrent reads (no writers). +Zero cache misses observed during parallel phase. + +### `execute_parallel` orchestrator (production-ready) + +Lives on the builder (`UltraCircuitBuilder::execute_parallel`). Takes a vector of task lambdas +and pre-computed per-task sizes. 
Handles: +- Pre-allocation of all blocks and variables +- Per-thread cursor setup (on main thread, avoiding resize races) +- Thread dispatch with `set_parallel_thread_index` +- Deferred operation replay after join +- 500/500 stress test, zero race conditions + +### Profile-based planning (simulates eventual table lookup) + +`profile_constraint_type` runs a constraint on a throwaway builder and extracts: +- `TaskBlockSizes` (per-block gate counts + variable count) +- Constants to pre-register +- Range list targets to pre-create +- Lookup table IDs to pre-create + +`prepare_builder_from_profiles` populates the real builder's caches from this data without +executing any constraints. This is the interface the table lookup will eventually implement. + +## Real circuit analysis + +### Aztec transaction opcode breakdown (ecdsar1+transfer_0_recursions+sponsored_fpc) + +| Circuit | Opcodes | Gates | Key constraint types | +|---------|--------:|------:|---------------------| +| EcdsaRAccount:entrypoint | 8,938 | 78,062 | 2000 quad, 468 range, 1 sha256, 29 pos2, 1 ecdsa_r1 | +| private_kernel_init | 8,913 | 44,239 | 6253 quad, 1218 range, 69 pos2, 1 msm | +| private_kernel_inner | 19,697 | 95,347 | 10724 quad, 3683 big_quad, 2658 range, 69 pos2 | +| Token:transfer | 22,600 | 79,563 | 11675 quad, 3752 range, 57 pos2, 8 msm, 6 aes128 | +| private_kernel_reset | 29,586 | 102,252 | 16018 quad, 3403 big_quad, 4447 range, 375 pos2 | +| private_kernel_tail | 9,096 | 43,186 | 6634 quad, 1402 range, 11 pos2 | +| hiding_kernel | 1,502 | 36,180 | 1413 quad, 80 range, 7 pos2 | + +### Per-block gate breakdown for private_kernel_inner (95,347 gates) + +| Block | Gates | % | +|-------|------:|--:| +| arithmetic | 40,636 | 42.6% | +| poseidon2_internal | 39,216 | 41.1% | +| poseidon2_external | 6,880 | 7.2% | +| elliptic | 336 | 0.4% | + +No single opcode dominates. 69 Poseidon2 instances produce 48% of gates. 
With table lookup +(no warmup), all ~19,600 constraints distribute across threads with near-linear speedup. + +## Builder changes implemented + +### execution_trace_block.hpp +- Per-thread cursor arrays (`std::vector cursors_`) on `Selector` and `ExecutionTraceBlock` +- `thread_local parallel_thread_idx` for routing operations to correct cursor +- `enable_cursor_mode(thread_idx, start)` / `disable_cursor_mode(thread_idx)` +- `populate_wires` and selector writes route through `active_cursor()` / `active_cursor_ref()` +- `last_gate_index()` / `next_gate_index()` for cursor-aware gate position queries +- `wire_cursor_start()` for tracking task boundary (prevents cross-task gate fusion) + +### circuit_builder_base.hpp +- Per-thread variable cursor array (`std::vector variable_cursors_`) +- `enable_variable_cursor(thread_idx, start)` / `disable_variable_cursor(thread_idx)` +- `resize_variables(total_size)` for pre-allocation +- `get_variable_cursor()` routes through `parallel_thread_idx` +- `increment_num_gates` skipped in cursor mode +- `get_next_var_index()` / `get_prev_var_index()` const accessors + +### ultra_circuit_builder.hpp +- `TaskBlockSizes` struct + `snapshot_block_sizes()` / `delta()` +- `execute_parallel()` orchestrator +- Per-thread deferred buffers for lookup gates and range constraints +- `init_deferred_buffers()` / `apply_deferred_lookup_gates()` / `apply_deferred_range_constraints()` +- `update_used_witnesses` / `update_finalize_witnesses` skipped in cursor mode + +### ultra_circuit_builder.cpp +- `put_constant_variable`: read-only cache bypass in cursor mode +- `create_small_range_constraint`: deferral in cursor mode +- `create_gates_from_plookup_accumulators`: lookup gate deferral in cursor mode +- `create_ecc_add_gate` / `create_ecc_dbl_gate`: cursor-aware gate fusion (uses cursor position + instead of `block.size()` to find previous gate; fusion disabled at task boundaries) + +### rom_ram_logic.hpp / rom_ram_logic.cpp +- Per-thread ROM/RAM ID 
cursors for pre-allocated array assignment +- `create_ROM_array` / `create_RAM_array`: cursor-mode returns pre-assigned IDs +- `gate_index` recording uses `last_gate_index()` / `next_gate_index()` instead of `block.size()` + +### acir_format.cpp +- `profile_constraint_type()`: throwaway builder measurement (separate pre-warmed builder to + avoid cross-instance gate fusion in profiling) +- `prepare_builder_from_profiles()`: cache population from profiles +- `build_constraints_parallel()`: full parallel orchestration +- Constraint type grouping by gate-count-affecting parameters (range by num_bits, big_quad by + size(), logic by (num_bits, is_xor_gate), aes128/blake2s/blake3 by inputs.size(), poseidon2 + by state.size(), multi_scalar_mul by points.size()) + +## Shared state audit + +| State | Category | Solution | Verified | +|-------|----------|----------|----------| +| Block gate writes | Partitionable | Per-thread cursors | Yes (500/500) | +| `add_variable` | Partitionable | Per-thread variable cursors | Yes | +| `assert_equal` / union-find | Naturally disjoint | No change needed | Yes (43k vars) | +| `put_constant_variable` | Read-only after warmup | Cache bypass in cursor mode | Yes (0 misses) | +| Range list creation | One-time init | Pre-created from profiles | Yes | +| Plookup table creation | One-time init | Pre-created from profiles | Yes | +| `create_small_range_constraint` | Deferred | Per-thread buffer, replay | Yes (458 entries) | +| `table.lookup_gates` append | Deferred | Per-thread buffer, replay | Yes (5792 entries) | +| `update_used_witnesses` | Skip in cursor mode | Boomerang detection only | Yes | +| `update_finalize_witnesses` | Skip in cursor mode | Finalize detection only | Yes | +| `increment_num_gates` | Skip in cursor mode | Pre-computed total | Yes | +| ROM/RAM array creation | Pre-allocated | Per-thread ID cursors | Yes | +| `memory_read/write_records` | Gate index recording | Uses `last_gate_index()` | Yes | +| ECC gate fusion | 
Cursor-aware | Uses cursor position, not block.size() | In progress | + +## The remaining blocker: setup gate ordering + +### The problem + +In sequential `build_constraints`, the first constraint of each type triggers one-time setup: +- Range list staircase creation (`create_range_list` → arithmetic gates + variables) +- Lookup table initialization (`get_table` → populates `lookup_tables`) +- Constant registration (`put_constant_variable` → `fix_witness` → arithmetic gate) + +These setup gates are interleaved with the first constraint's own gates. Their position in the +circuit affects the VK. + +In parallel mode, `prepare_builder_from_profiles` creates setup gates separately before any +constraints run. The setup gates land at different positions → different circuit → different VK. + +Both circuits are valid (both pass CircuitChecker), but they are NOT identical. + +### The solution: precursor refactor + +Change the sequential `build_constraints` path to separate setup from execution. This is a +standalone change with no parallel code — just reordering when setup gates are created. + +### What currently happens (implicit setup) + +When `build_constraints` processes constraints sequentially, the first constraint of each type +triggers lazy initialization: + +1. **`put_constant_variable(value)`** — if the value isn't cached, creates a new variable + + `fix_witness` gate (1 arithmetic gate). Every subsequent call with the same value returns the + cached index. SHA256 creates ~900 unique constants on its first invocation; the 2nd+ SHA256 + finds them all cached. These `fix_witness` gates are interleaved with the first constraint's + own gates in the arithmetic block. + +2. **`create_range_list(target_range)`** — called lazily from `create_small_range_constraint` when + a range target hasn't been seen before. Creates a "staircase" of sorted padding variables + + unconstrained arithmetic gates (e.g., SHA256 triggers 5 range lists costing 1371 arithmetic + gates). 
These gates are interleaved with the first constraint that triggers each range. + +3. **`get_table(table_id)`** — called lazily from `create_gates_from_plookup_accumulators` when a + lookup table hasn't been created yet. Appends to `lookup_tables`. No gates are created, but the + table must exist before any plookup reads reference it. + +### What needs to change (explicit setup) + +Add a setup phase at the beginning of `build_constraints` that pre-creates all setup state before +any constraint processing. This requires knowing which constraint types are present in the program. + +**Concrete changes to `build_constraints` in `acir_format.cpp`:** + +1. **Pre-register constants.** Before the constraint loops, call `put_constant_variable(v)` for + every constant value that any constraint type will need. The set of constants per constraint + type is deterministic (SHA256 always needs the same ~900 constants, Poseidon2 needs ~5, etc.). + Source: `ConstraintProfile::constants` from `profile_constraint_type`, or eventually a stored + table. + +2. **Pre-create range lists.** Before the constraint loops, call `create_range_list(target)` for + every range target that any constraint type will need. Source: + `ConstraintProfile::range_list_targets`, or eventually a stored table. SHA256 needs targets + {1, 3, 7, 15, 16383}. Most opcodes need only {16383} (DEFAULT_PLOOKUP_RANGE_SIZE). + +3. **Pre-create lookup tables.** Before the constraint loops, call `get_table(id)` for every + BasicTableId that any constraint type will need. Source: `ConstraintProfile::table_ids`, or + eventually a stored table. SHA256 needs SHA256 lookup tables; logic constraints need XOR/AND + tables; etc. + +**The key invariant:** After the setup phase, no constraint execution triggers `put_constant_variable` +cache misses, `create_range_list` calls, or new `get_table` calls. Every constraint — whether it's +the 1st or 100th of its type — produces identical gates. 
+ +**Where the setup data comes from (now vs later):** +- **Now (PoC):** `profile_constraint_type` runs constraints on a throwaway builder and extracts + the constants/ranges/tables. This is slow (2x work per type) but correct. +- **Later (production):** A stored table keyed by `(constraint_type, parameters)` provides the + same data as a compile-time lookup. The table is generated once and validated by pinning tests. + +### Effect on the circuit + +The setup gates (fix_witness for constants, range list staircases) move from being interleaved +with the first constraint of each type to being grouped at the beginning of the circuit. This +changes: +- Gate ordering within the arithmetic block +- Variable indices (constants get earlier indices) +- The VK (different gate positions → different polynomials) + +It does NOT change: +- The set of constraints (same gates, just reordered) +- The satisfying witness assignment +- Circuit correctness (both old and new pass CircuitChecker and prove/verify) + +### Implementation plan + +**Single PR:** Ship the parallel infrastructure and wire it into `create_circuit` for Ultra. + +1. Extract all PoC changes (builder + acir_format) into a clean branch off `merge-train/barretenberg` +2. Wire `build_constraints_parallel` into `create_circuit` (replacing + `build_constraints` for Ultra only; Mega and other builders continue using the sequential path) +3. Enable `BuildConstraintsParallelN1vsN2` test to validate bit-identical circuits across thread counts +4. Run full test suite (`dsl_tests`, `ultra_honk_tests`, `chonk_tests`) +5. Update VKs via `test_chonk_standalone_vks_havent_changed.sh --update_inputs` +6. 
Later: replace `profile_constraint_type` (throwaway builder) with stored lookup tables for + production performance + +## Files modified in PoC + +| File | Change | +|------|--------| +| `honk/execution_trace/execution_trace_block.hpp` | Per-thread cursors, thread-local index, `last_gate_index()`, `next_gate_index()`, `wire_cursor_start()` | +| `stdlib_circuit_builders/circuit_builder_base.hpp` | Per-thread variable cursors, resize, accessors | +| `stdlib_circuit_builders/circuit_builder_base_impl.hpp` | Cursor-aware add_variable | +| `stdlib_circuit_builders/ultra_circuit_builder.hpp` | execute_parallel, deferred buffers, TaskBlockSizes, ROM/RAM cursor management | +| `stdlib_circuit_builders/ultra_circuit_builder.cpp` | put_constant_variable bypass, deferral checks, cursor-aware ECC gate fusion | +| `stdlib_circuit_builders/rom_ram_logic.hpp` | Per-thread ROM/RAM ID cursors | +| `stdlib_circuit_builders/rom_ram_logic.cpp` | Cursor-mode ROM/RAM creation, cursor-aware gate_index recording | +| `dsl/acir_format/acir_format.hpp` | build_constraints_parallel declaration | +| `dsl/acir_format/acir_format.cpp` | profile_constraint_type, prepare_builder_from_profiles, build_constraints_parallel, constraint grouping | +| `dsl/acir_format/per_block_gate_count.test.cpp` | All PoC tests | + +## Tests + +| Test | What it verifies | +|------|-----------------| +| `RealParallelChainedSha256` | Bit-identical circuit (full wire/selector/variable/union-find comparison) with real witness values, CircuitChecker on both, chained data dependencies, 5792 deferred lookups + 458 deferred ranges | +| `BuildConstraintsParallelN1vsN2` | Real AcirProgram through `build_constraints_parallel` with 1 vs 2 threads, full wire/selector/variable/union-find comparison — validates that sequential is just the N=1 case of parallel | +| `SequentialVsParallelSemanticEquivalence` | Sequential `build_constraints` vs `build_constraints_parallel` — same block sizes, variable counts, copy cycles, constants, 
range lists, lookup tables | +| `AcirTestParallelEquivalence` | Parameterized over all acir_tests — 3-way comparison (sequential, N=1, N=2) with semantic equivalence and bit-identical checks | +| `IsolatedVsSharedSelectorEquivalence` | Selector equivalence between isolated and shared warmed builders | +| `WarmedAdditivityComprehensive` | Gate count additivity across 5 opcode types after warmup | +| Individual opcode measurements | Per-block gate counts for Quad, SHA256, Poseidon2, EC Add, Logic XOR | diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/per_block_gate_count.test.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/per_block_gate_count.test.cpp new file mode 100644 index 000000000000..4c0a12281271 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/per_block_gate_count.test.cpp @@ -0,0 +1,956 @@ +/** + * @file per_block_gate_count.test.cpp + * @brief Measures per-block gate counts for each ACIR opcode type, and tests whether they are additive across opcodes. + * + * @details This is a PoC investigating whether ACIR circuit construction can be parallelized via a "plan then execute" + * model. The key question: if we know the per-block gate count for each opcode, can we pre-compute a deterministic + * layout (prefix sum of per-block sizes), then execute opcodes in parallel into pre-allocated regions? + * + * Step 1: Measure per-block gate counts for individual opcodes. + * Step 2: Test additivity — does the sum of individual per-block counts match a combined circuit? 
+ */ + +#include + +#include "acir_format.hpp" +#include "acir_to_constraint_buf.hpp" +#include "barretenberg/circuit_checker/circuit_checker.hpp" +#include "barretenberg/common/get_bytecode.hpp" +#include "barretenberg/crypto/poseidon2/poseidon2.hpp" +#include "barretenberg/crypto/sha256/sha256.hpp" +#include "barretenberg/dsl/acir_format/poseidon2_constraint.hpp" +#include "barretenberg/dsl/acir_format/sha256_constraint.hpp" +#include "barretenberg/dsl/acir_format/test_class.hpp" +#include "barretenberg/dsl/acir_format/utils.hpp" +#include "barretenberg/special_public_inputs/special_public_inputs.hpp" +#include "barretenberg/stdlib_circuit_builders/mega_circuit_builder.hpp" +#include "barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp" +#include "barretenberg/ultra_honk/prover_instance.hpp" +#include "barretenberg/ultra_honk/ultra_prover.hpp" + +#include + +using namespace bb; +using namespace acir_format; + +class PerBlockGateCountTests : public ::testing::Test { + protected: + static void SetUpTestSuite() { bb::srs::init_file_crs_factory(bb::srs::bb_crs_path()); } +}; + +// Helper to build the test program: 3 SHA256 + 3 Poseidon2 +AcirFormat build_sha256_poseidon2_test_program(WitnessVector& witness_out) +{ + std::vector all_opcodes; + + // 3 SHA256 compression constraints, each using 32 witnesses + for (uint32_t i = 0; i < 3; i++) { + uint32_t base = i * 32; + Sha256Compression sha; + for (size_t j = 0; j < 16; ++j) + sha.inputs[j] = WitnessOrConstant::from_index(base + static_cast(j)); + for (size_t j = 0; j < 8; ++j) + sha.hash_values[j] = WitnessOrConstant::from_index(base + static_cast(j)); + for (size_t j = 0; j < 8; ++j) + sha.result[j] = base + static_cast(j) + 24; + auto ops = constraint_to_acir_opcode(sha); + all_opcodes.insert(all_opcodes.end(), ops.begin(), ops.end()); + } + + // 3 Poseidon2 constraints, each using 8 witnesses, starting after SHA256 witnesses + for (uint32_t i = 0; i < 3; i++) { + uint32_t base = 96 + i * 8; + 
Poseidon2Constraint pos; + for (uint32_t j = 0; j < 4; j++) { + pos.state.emplace_back(WitnessOrConstant::from_index(base + j)); + pos.result.emplace_back(base + 4 + j); + } + auto ops = constraint_to_acir_opcode(pos); + all_opcodes.insert(all_opcodes.end(), ops.begin(), ops.end()); + } + + Acir::Circuit circuit = build_acir_circuit(all_opcodes); + witness_out = WitnessVector(120, fr(0)); + return circuit_serde_to_acir_format(circuit); +} + +// N=1 parallel vs N=2 parallel: should be bit-identical since both go through +// prepare_builder_from_profiles and execute_parallel. +TEST_F(PerBlockGateCountTests, ParallelN1vsN2BitIdentical) +{ + WitnessVector witness; + AcirFormat constraint_system = build_sha256_poseidon2_test_program(witness); + + // Build with 1 thread + AcirFormat n1_constraints = constraint_system; + UltraCircuitBuilder n1_builder{ WitnessVector(witness), n1_constraints.public_inputs, false }; + build_constraints_parallel(n1_builder, n1_constraints, ProgramMetadata{}, /*num_threads=*/1); + + // Build with 2 threads + AcirFormat n2_constraints = constraint_system; + UltraCircuitBuilder n2_builder{ WitnessVector(witness), n2_constraints.public_inputs, false }; + build_constraints_parallel(n2_builder, n2_constraints, ProgramMetadata{}, /*num_threads=*/2); + + // Both must pass circuit checker + EXPECT_TRUE(CircuitChecker::check(n1_builder)); + EXPECT_TRUE(CircuitChecker::check(n2_builder)); + + // Bit-identical: every block's wires and selectors must match + auto n1_blocks = n1_builder.blocks.get(); + auto n2_blocks = n2_builder.blocks.get(); + for (size_t b = 0; b < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; b++) { + EXPECT_EQ(n1_blocks[b].size(), n2_blocks[b].size()) << "block " << b << " size mismatch"; + size_t count = std::min(n1_blocks[b].size(), n2_blocks[b].size()); + + size_t wire_mismatches = 0; + for (size_t w = 0; w < 4; w++) { + for (size_t i = 0; i < count; i++) { + if (n1_blocks[b].wires[w][i] != n2_blocks[b].wires[w][i]) + 
wire_mismatches++; + } + } + EXPECT_EQ(wire_mismatches, 0) << "block " << b << ": " << wire_mismatches << " wire mismatches"; + + auto n1_sels = n1_blocks[b].get_selectors(); + auto n2_sels = n2_blocks[b].get_selectors(); + size_t sel_mismatches = 0; + for (size_t s = 0; s < n1_sels.size(); s++) { + for (size_t i = 0; i < count; i++) { + if (n1_sels[s][i] != n2_sels[s][i]) + sel_mismatches++; + } + } + EXPECT_EQ(sel_mismatches, 0) << "block " << b << ": " << sel_mismatches << " selector mismatches"; + } + + // Variable counts and union-find must match exactly + EXPECT_EQ(n1_builder.get_num_variables(), n2_builder.get_num_variables()); + size_t num_vars = std::min(n1_builder.get_num_variables(), n2_builder.get_num_variables()); + size_t real_idx_mismatches = 0; + for (size_t i = 0; i < num_vars; i++) { + if (n1_builder.real_variable_index[i] != n2_builder.real_variable_index[i]) + real_idx_mismatches++; + } + EXPECT_EQ(real_idx_mismatches, 0) << "real_variable_index mismatches"; +} + +// Helper: create a valid UltraHonk proof and convert it to a RecursionConstraint. +// Returns the constraint and the witness vector containing proof/VK data. 
+std::pair create_honk_recursion_test_data() +{ + using InnerFlavor = UltraFlavor; + using InnerBuilder = UltraCircuitBuilder; + using InnerProverInstance = ProverInstance_; + using InnerProver = UltraProver; + using InnerIO = stdlib::recursion::honk::DefaultIO; + + // Create a simple inner circuit: one mul gate + default public inputs + InnerBuilder inner_builder; + auto a = inner_builder.add_variable(fr::random_element()); + auto b = inner_builder.add_variable(fr::random_element()); + auto c = inner_builder.add_variable(inner_builder.get_variable(a) * inner_builder.get_variable(b)); + inner_builder.create_big_mul_add_gate({ .a = a, + .b = b, + .c = c, + .d = inner_builder.zero_idx(), + .mul_scaling = 1, + .a_scaling = 0, + .b_scaling = 0, + .c_scaling = -1, + .d_scaling = 0, + .const_scaling = 0 }); + InnerIO::add_default(inner_builder); + + auto prover_instance = std::make_shared(inner_builder); + auto verification_key = std::make_shared(prover_instance->get_precomputed()); + InnerProver prover(prover_instance, verification_key); + auto proof = prover.construct_proof(); + + WitnessVector witness; + RecursionConstraint constraint = + recursion_data_to_recursion_constraint(witness, + proof, + verification_key->to_field_elements(), + verification_key->hash(), + bb::fr::one(), + inner_builder.num_public_inputs() - InnerIO::PUBLIC_INPUTS_SIZE, + HONK); + + return { constraint, witness }; +} + +// Forward declarations for functions defined later in this file +size_t check_semantic_equivalence(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b); +size_t check_bit_identical(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b); +std::filesystem::path find_acir_tests_dir(); + +// Test that a circuit with a HONK recursion constraint passes CircuitChecker +// when built through the sequential and parallel paths. 
+TEST_F(PerBlockGateCountTests, RecursionConstraintBasic) +{ + auto [recursion_constraint, witness] = create_honk_recursion_test_data(); + + AcirFormat constraints{}; + constraints.honk_recursion_constraints = { recursion_constraint }; + constraints.original_opcode_indices.honk_recursion_constraints = { 0 }; + constraints.num_acir_opcodes = 1; + constraints.max_witness_index = static_cast(witness.size() - 1); + ProgramMetadata metadata{}; + + // Fix predicate to constant true (matching production Noir circuits) + constraints.honk_recursion_constraints[0].predicate = WitnessOrConstant::from_constant(bb::fr(1)); + + // Step 2: Use Mega for smaller circuits. Build parallel first, then sequential with same pre-warming. + // Mega parallel N=1 + AcirFormat par_constraints = constraints; + MegaCircuitBuilder par_builder{ + std::make_shared(), WitnessVector(witness), par_constraints.public_inputs, false + }; + build_constraints_parallel(par_builder, par_constraints, metadata, /*num_threads=*/1); + info(" Mega Parallel N=1: vars=", par_builder.get_num_variables()); + + // Mega sequential with same constants pre-registered + AcirFormat seq_constraints = constraints; + MegaCircuitBuilder seq_builder{ + std::make_shared(), WitnessVector(witness), seq_constraints.public_inputs, false + }; + for (const auto& [val, _] : par_builder.constant_variable_indices) { + seq_builder.put_constant_variable(val); + } + for (const auto& [target, rl] : par_builder.range_lists) { + if (seq_builder.range_lists.count(target) == 0) { + seq_builder.range_lists.insert({ target, seq_builder.create_range_list(target) }); + } + } + build_constraints(seq_builder, seq_constraints, metadata); + info(" Mega Sequential (pre-warmed): vars=", seq_builder.get_num_variables()); + + // Compare + EXPECT_EQ(par_builder.get_num_variables(), seq_builder.get_num_variables()) << "Variable count mismatch"; + { + auto pb = par_builder.blocks.get(); + auto sb = seq_builder.blocks.get(); + for (size_t bl = 0; bl < 
MegaCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + EXPECT_EQ(pb[bl].size(), sb[bl].size()) << "Block " << bl << " size mismatch"; + } + } + // Copy cycles + { + auto collect_cycles = [](auto& builder) { + std::map root_sizes; + for (size_t i = 0; i < builder.get_num_variables(); i++) { + root_sizes[builder.real_variable_index[i]]++; + } + std::vector> cycles; + for (const auto& [root, sz] : root_sizes) { + cycles.emplace_back(builder.get_variable(root), sz); + } + std::sort(cycles.begin(), cycles.end(), [](const auto& x, const auto& y) { + return x.second != y.second ? x.second < y.second : x.first < y.first; + }); + return cycles; + }; + auto par_cycles = collect_cycles(par_builder); + auto seq_cycles = collect_cycles(seq_builder); + size_t cycle_mismatches = 0; + if (par_cycles.size() == seq_cycles.size()) { + for (size_t i = 0; i < par_cycles.size(); i++) { + if (par_cycles[i] != seq_cycles[i]) + cycle_mismatches++; + } + } + info(" Copy cycles: ", par_cycles.size(), " vs ", seq_cycles.size(), ", mismatches=", cycle_mismatches); + } + // Gate multiset for block 4 (arithmetic in Mega) + { + auto pb = par_builder.blocks.get(); + auto sb = seq_builder.blocks.get(); + size_t bl = 4; // arithmetic + if (pb[bl].size() == sb[bl].size() && pb[bl].size() > 0) { + size_t count = pb[bl].size(); + auto ps = pb[bl].get_selectors(); + auto ss = sb[bl].get_selectors(); + size_t ts = 4 + ps.size(); + auto ct = [&](const auto& blk, const auto& sels, auto& builder) { + std::vector> tuples; + tuples.reserve(count); + for (size_t i = 0; i < count; i++) { + std::vector t(ts); + for (size_t w = 0; w < 4; w++) + t[w] = builder.get_variable(blk.wires[w][i]); + for (size_t s = 0; s < sels.size(); s++) + t[4 + s] = sels[s][i]; + tuples.push_back(std::move(t)); + } + std::sort(tuples.begin(), tuples.end()); + return tuples; + }; + auto pt = ct(pb[bl], ps, par_builder); + auto st = ct(sb[bl], ss, seq_builder); + info(" Block 4 (arithmetic) multiset: ", pt == st ? 
"MATCH" : "MISMATCH", " (", count, " gates)"); + EXPECT_TRUE(pt == st) << "Gate multiset mismatch in block 4"; + } + } + + // CircuitChecker on both + EXPECT_TRUE(CircuitChecker::check(par_builder)) << "Parallel N=1 failed CircuitChecker"; + EXPECT_TRUE(CircuitChecker::check(seq_builder)) << "Sequential failed CircuitChecker"; + + // N=1 vs N=2 bit-identical + { + AcirFormat par2_constraints = constraints; + MegaCircuitBuilder par2_builder{ + std::make_shared(), WitnessVector(witness), par2_constraints.public_inputs, false + }; + build_constraints_parallel(par2_builder, par2_constraints, metadata, /*num_threads=*/2); + info(" Mega Parallel N=2: vars=", par2_builder.get_num_variables()); + EXPECT_EQ(par_builder.get_num_variables(), par2_builder.get_num_variables()) << "N=1 vs N=2 var count"; + + size_t n1_n2_diffs = 0; + for (size_t i = 0; i < par_builder.get_num_variables(); i++) { + if (par_builder.real_variable_index[i] != par2_builder.real_variable_index[i]) + n1_n2_diffs++; + } + info(" N=1 vs N=2 real_variable_index diffs: ", n1_n2_diffs); + EXPECT_EQ(n1_n2_diffs, 0) << "N=1 vs N=2 not bit-identical"; + } + + // Quick Ultra check: does the same test fail with Ultra? 
+ { + AcirFormat ultra_par_c = constraints; + UltraCircuitBuilder ultra_par{ WitnessVector(witness), ultra_par_c.public_inputs, false }; + build_constraints_parallel(ultra_par, ultra_par_c, metadata, /*num_threads=*/1); + + AcirFormat ultra_seq_c = constraints; + UltraCircuitBuilder ultra_seq{ WitnessVector(witness), ultra_seq_c.public_inputs, false }; + for (const auto& [val, _] : ultra_par.constant_variable_indices) { + ultra_seq.put_constant_variable(val); + } + for (const auto& [target, rl] : ultra_par.range_lists) { + if (ultra_seq.range_lists.count(target) == 0) { + ultra_seq.range_lists.insert({ target, ultra_seq.create_range_list(target) }); + } + } + build_constraints(ultra_seq, ultra_seq_c, metadata); + info(" Ultra: par vars=", ultra_par.get_num_variables(), " seq vars=", ultra_seq.get_num_variables()); + + size_t ultra_failures = check_semantic_equivalence("recursion Ultra seq-vs-par", ultra_seq, ultra_par); + info(" Ultra seq-vs-par: ", ultra_failures, " failures"); + } +} + +// Test recursion constraint alongside other constraint types in the parallel pipeline. +// Uses Mega builder for speed. The recursion constraint runs in Phase 4 (sequential), +// while quads and ranges run in Phase 3 (parallel). +TEST_F(PerBlockGateCountTests, RecursionWithOtherConstraints) +{ + auto [recursion_constraint, rec_witness] = create_honk_recursion_test_data(); + + // Build an AcirFormat with: the recursion constraint + some quad constraints + some range constraints. + // The quads and ranges use witness indices beyond the recursion witness range. 
+ uint32_t rec_max_witness = static_cast(rec_witness.size() - 1); + + // Create 4 quad constraints using fresh witnesses after the recursion witness range + std::vector quads; + uint32_t w = rec_max_witness + 1; + for (int i = 0; i < 4; i++) { + quads.push_back({ .a = w, + .b = w + 1, + .c = w + 2, + .d = w + 3, + .mul_scaling = 1, + .a_scaling = 0, + .b_scaling = 0, + .c_scaling = -1, + .d_scaling = 0, + .const_scaling = 0 }); + w += 4; + } + + // Create 4 range constraints on fresh witnesses + std::vector ranges; + for (int i = 0; i < 4; i++) { + ranges.push_back({ .witness = w, .num_bits = 8 }); + w++; + } + + uint32_t total_witnesses = w; + + // Extend witness vector with valid values for the new constraints + WitnessVector witness = rec_witness; + witness.resize(total_witnesses, fr(0)); + // Fill quad witnesses: a*b = c + uint32_t qw = rec_max_witness + 1; + for (int i = 0; i < 4; i++) { + fr a_val = fr::random_element(); + fr b_val = fr::random_element(); + witness[qw] = a_val; + witness[qw + 1] = b_val; + witness[qw + 2] = a_val * b_val; + witness[qw + 3] = fr(0); + qw += 4; + } + // Range witnesses: small values that fit in 8 bits + for (int i = 0; i < 4; i++) { + witness[qw + static_cast(i)] = fr(42 + i); + } + + AcirFormat constraints{}; + constraints.honk_recursion_constraints = { recursion_constraint }; + constraints.original_opcode_indices.honk_recursion_constraints = { 0 }; + constraints.quad_constraints = quads; + constraints.original_opcode_indices.quad_constraints = { 1, 2, 3, 4 }; + constraints.range_constraints = ranges; + constraints.original_opcode_indices.range_constraints = { 5, 6, 7, 8 }; + constraints.num_acir_opcodes = 9; + constraints.max_witness_index = total_witnesses - 1; + + ProgramMetadata metadata{}; + + // Build with Mega N=1 and N=2 + AcirFormat n1_constraints = constraints; + MegaCircuitBuilder n1_builder{ + std::make_shared(), WitnessVector(witness), n1_constraints.public_inputs, false + }; + 
build_constraints_parallel(n1_builder, n1_constraints, metadata, /*num_threads=*/1); + + AcirFormat n2_constraints = constraints; + MegaCircuitBuilder n2_builder{ + std::make_shared(), WitnessVector(witness), n2_constraints.public_inputs, false + }; + build_constraints_parallel(n2_builder, n2_constraints, metadata, /*num_threads=*/2); + + info("Recursion+quads+ranges Mega: N1 vars=", + n1_builder.get_num_variables(), + " N2 vars=", + n2_builder.get_num_variables()); + + EXPECT_TRUE(CircuitChecker::check(n1_builder)) << "N=1 CircuitChecker failed"; + EXPECT_TRUE(CircuitChecker::check(n2_builder)) << "N=2 CircuitChecker failed"; + EXPECT_EQ(n1_builder.get_num_variables(), n2_builder.get_num_variables()); +} + +// Find the acir_tests directory relative to the source tree +std::filesystem::path find_acir_tests_dir() +{ + // Walk up from the build dir to find the repo root + // The acir_tests are at barretenberg/acir_tests/acir_tests/ + std::filesystem::path candidate = std::filesystem::current_path(); + for (int i = 0; i < 10; i++) { + auto test_dir = candidate / "barretenberg" / "acir_tests" / "acir_tests"; + if (std::filesystem::exists(test_dir)) { + return test_dir; + } + candidate = candidate.parent_path(); + } + return {}; +} + +// Collect all acir_test directories that have compiled artifacts +std::vector collect_acir_test_programs() +{ + auto acir_dir = find_acir_tests_dir(); + if (acir_dir.empty()) { + return {}; + } + std::vector programs; + for (const auto& entry : std::filesystem::directory_iterator(acir_dir)) { + if (!entry.is_directory()) + continue; + auto program_json = entry.path() / "target" / "program.json"; + auto witness_gz = entry.path() / "target" / "witness.gz"; + if (std::filesystem::exists(program_json) && std::filesystem::exists(witness_gz)) { + programs.push_back(entry.path()); + } + } + std::sort(programs.begin(), programs.end()); + return programs; +} + +// Check semantic equivalence between two builders: same block sizes, variable counts, 
+// copy cycle structure, constants, range lists, and lookup tables. +// Returns number of failures (0 = all invariants hold). +size_t check_semantic_equivalence(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b) +{ + size_t failures = 0; + + // Block sizes + auto a_blocks = a.blocks.get(); + auto b_blocks = b.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (a_blocks[bl].size() > 0 || b_blocks[bl].size() > 0) { + bool ok = (a_blocks[bl].size() == b_blocks[bl].size()); + info(label, + ": block ", + bl, + ": ", + a_blocks[bl].size(), + " vs ", + b_blocks[bl].size(), + ok ? " OK" : " MISMATCH"); + if (!ok) + failures++; + } + } + + // Variable count + { + bool ok = (a.get_num_variables() == b.get_num_variables()); + info(label, ": variables: ", a.get_num_variables(), " vs ", b.get_num_variables(), ok ? " OK" : " MISMATCH"); + if (!ok) + failures++; + } + + // Constants + { + bool ok = (a.constant_variable_indices.size() == b.constant_variable_indices.size()); + info(label, + ": constants: ", + a.constant_variable_indices.size(), + " vs ", + b.constant_variable_indices.size(), + ok ? " OK" : " MISMATCH"); + } + + // Range lists + { + bool ok = (a.range_lists.size() == b.range_lists.size()); + info(label, ": range_lists: ", a.range_lists.size(), " vs ", b.range_lists.size(), ok ? " OK" : " MISMATCH"); + } + + // Lookup tables + { + bool ok = (a.get_lookup_tables().size() == b.get_lookup_tables().size()); + info(label, + ": lookup_tables: ", + a.get_lookup_tables().size(), + " vs ", + b.get_lookup_tables().size(), + ok ? " OK" : " MISMATCH"); + } + + // Copy cycles: compare as sorted list of (value, cycle_size) pairs. + // Each cycle is a set of variables with the same real_variable_index root. + // The cycle's "value" is the field element at that root (all vars in the cycle share it). + // This checks that the same groups of variables are assert_equal'd, up to reordering. 
+ auto collect_cycles = [](const UltraCircuitBuilder& builder) -> std::vector> { + std::map root_sizes; + for (size_t i = 0; i < builder.get_num_variables(); i++) { + root_sizes[builder.real_variable_index[i]]++; + } + std::vector> cycles; + cycles.reserve(root_sizes.size()); + for (const auto& [root, sz] : root_sizes) { + cycles.emplace_back(builder.get_variable(root), sz); + } + std::sort(cycles.begin(), cycles.end(), [](const auto& x, const auto& y) { + if (x.second != y.second) + return x.second < y.second; + return x.first < y.first; + }); + return cycles; + }; + auto a_cycles = collect_cycles(a); + auto b_cycles = collect_cycles(b); + if (a_cycles.size() != b_cycles.size()) { + info(label, ": copy cycle count mismatch: ", a_cycles.size(), " vs ", b_cycles.size()); + failures++; + } else { + size_t cycle_mismatches = 0; + for (size_t i = 0; i < a_cycles.size(); i++) { + if (a_cycles[i] != b_cycles[i]) { + cycle_mismatches++; + } + } + if (cycle_mismatches > 0) { + info(label, ": ", cycle_mismatches, " copy cycle (value, size) mismatches out of ", a_cycles.size()); + failures++; + } + } + + // Constants: same set of constant values (not just count) + { + std::set a_consts, b_consts; + for (const auto& [val, _] : a.constant_variable_indices) + a_consts.insert(val); + for (const auto& [val, _] : b.constant_variable_indices) + b_consts.insert(val); + if (a_consts != b_consts) { + info(label, ": constant value sets differ: a has ", a_consts.size(), " b has ", b_consts.size()); + failures++; + } + } + + // Range lists: same targets, same variable counts per target + if (a.range_lists.size() != b.range_lists.size()) { + info(label, ": range list count mismatch: ", a.range_lists.size(), " vs ", b.range_lists.size()); + failures++; + } + for (const auto& [target, a_rl] : a.range_lists) { + auto it = b.range_lists.find(target); + if (it == b.range_lists.end()) { + info(label, ": range target ", target, " missing from second builder"); + failures++; + } else if 
(a_rl.variable_indices.size() != it->second.variable_indices.size()) { + info(label, + ": range target ", + target, + " variable count mismatch: ", + a_rl.variable_indices.size(), + " vs ", + it->second.variable_indices.size()); + failures++; + } + } + + // Gate multiset comparison: for each block, collect all gate tuples (resolved wire values + + // selector values), sort them, and compare. This checks that the same gates exist in both + // circuits regardless of ordering. + { + auto a_blks = a.blocks.get(); + auto b_blks = b.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (a_blks[bl].size() != b_blks[bl].size()) { + continue; // already reported as block size mismatch + } + size_t count = a_blks[bl].size(); + if (count == 0) { + continue; + } + + // Collect gate tuples: 4 resolved wire values + all selector values + auto a_sels = a_blks[bl].get_selectors(); + auto b_sels = b_blks[bl].get_selectors(); + size_t tuple_size = 4 + a_sels.size(); + + auto collect_tuples = [&](const auto& blk, const auto& sels, const UltraCircuitBuilder& builder) { + std::vector> tuples; + tuples.reserve(count); + for (size_t i = 0; i < count; i++) { + std::vector t(tuple_size); + for (size_t w = 0; w < 4; w++) { + t[w] = builder.get_variable(blk.wires[w][i]); + } + for (size_t s = 0; s < sels.size(); s++) { + t[4 + s] = sels[s][i]; + } + tuples.push_back(std::move(t)); + } + std::sort(tuples.begin(), tuples.end()); + return tuples; + }; + + auto a_tuples = collect_tuples(a_blks[bl], a_sels, a); + auto b_tuples = collect_tuples(b_blks[bl], b_sels, b); + + if (a_tuples != b_tuples) { + info(label, ": block ", bl, " gate multiset mismatch (", count, " gates)"); + // Find first difference + size_t a_only = 0; + size_t b_only = 0; + size_t ai = 0; + size_t bi = 0; + while (ai < a_tuples.size() && bi < b_tuples.size()) { + if (a_tuples[ai] == b_tuples[bi]) { + ai++; + bi++; + } else if (a_tuples[ai] < b_tuples[bi]) { + a_only++; + ai++; 
+ } else { + b_only++; + bi++; + } + } + a_only += a_tuples.size() - ai; + b_only += b_tuples.size() - bi; + info(label, ": block ", bl, " a_only=", a_only, " b_only=", b_only); + // Print first few differing tuples from each side + ai = 0; + bi = 0; + size_t printed_a = 0; + size_t printed_b = 0; + while (ai < a_tuples.size() && bi < b_tuples.size() && (printed_a < 3 || printed_b < 3)) { + if (a_tuples[ai] == b_tuples[bi]) { + ai++; + bi++; + } else if (a_tuples[ai] < b_tuples[bi]) { + if (printed_a < 3) { + info(" a_only[", + printed_a, + "]: w0=", + a_tuples[ai][0], + " w1=", + a_tuples[ai][1], + " w2=", + a_tuples[ai][2], + " w3=", + a_tuples[ai][3]); + printed_a++; + } + ai++; + } else { + if (printed_b < 3) { + info(" b_only[", + printed_b, + "]: w0=", + b_tuples[bi][0], + " w1=", + b_tuples[bi][1], + " w2=", + b_tuples[bi][2], + " w3=", + b_tuples[bi][3]); + printed_b++; + } + bi++; + } + } + failures++; + } + } + } + + // Lookup tables + if (a.get_lookup_tables().size() != b.get_lookup_tables().size()) { + info(label, + ": lookup table count mismatch: ", + a.get_lookup_tables().size(), + " vs ", + b.get_lookup_tables().size()); + failures++; + } + + return failures; +} + +// Check bit-identical circuits (every wire, selector, variable, and union-find entry must match). +// Returns number of mismatches (0 = identical). 
+size_t check_bit_identical(const std::string& label, UltraCircuitBuilder& a, UltraCircuitBuilder& b) +{ + size_t mismatches = 0; + + auto a_blocks = a.blocks.get(); + auto b_blocks = b.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (a_blocks[bl].size() != b_blocks[bl].size()) { + info(label, ": block ", bl, " size mismatch: ", a_blocks[bl].size(), " vs ", b_blocks[bl].size()); + mismatches++; + continue; + } + size_t count = a_blocks[bl].size(); + for (size_t w = 0; w < 4; w++) { + for (size_t i = 0; i < count; i++) { + if (a_blocks[bl].wires[w][i] != b_blocks[bl].wires[w][i]) + mismatches++; + } + } + auto a_sels = a_blocks[bl].get_selectors(); + auto b_sels = b_blocks[bl].get_selectors(); + for (size_t s = 0; s < a_sels.size(); s++) { + for (size_t i = 0; i < count; i++) { + if (a_sels[s][i] != b_sels[s][i]) + mismatches++; + } + } + } + + if (a.get_num_variables() != b.get_num_variables()) { + info(label, ": variable count mismatch"); + mismatches++; + } else { + for (size_t i = 0; i < a.get_num_variables(); i++) { + if (a.real_variable_index[i] != b.real_variable_index[i]) + mismatches++; + } + } + + return mismatches; +} + +// Parameterized test that runs the 3-way comparison on every acir_test program. 
+class AcirTestParallelEquivalence : public ::testing::TestWithParam { + protected: + static void SetUpTestSuite() { bb::srs::init_file_crs_factory(bb::srs::bb_crs_path()); } +}; + +TEST_P(AcirTestParallelEquivalence, SequentialN1N2) +{ + auto test_dir = GetParam(); + std::string test_name = test_dir.filename().string(); + auto program_path = test_dir / "target" / "program.json"; + auto witness_path = test_dir / "target" / "witness.gz"; + + // Load bytecode and witness + auto bytecode = get_bytecode(program_path.string()); + AcirFormat constraints = circuit_buf_to_acir_format(std::move(bytecode)); + auto witness_buf = gunzip(witness_path.string()); + WitnessVector witness = witness_buf_to_witness_vector(std::move(witness_buf)); + + // Print constraint breakdown for diagnostics + info(" quad=", + constraints.quad_constraints.size(), + " big_quad=", + constraints.big_quad_constraints.size(), + " logic=", + constraints.logic_constraints.size(), + " range=", + constraints.range_constraints.size(), + " sha256=", + constraints.sha256_compression.size(), + " ecdsa_k1=", + constraints.ecdsa_k1_constraints.size(), + " ecdsa_r1=", + constraints.ecdsa_r1_constraints.size(), + " poseidon2=", + constraints.poseidon2_constraints.size(), + " block=", + constraints.block_constraints.size(), + " msm=", + constraints.multi_scalar_mul_constraints.size(), + " ec_add=", + constraints.ec_add_constraints.size(), + " aes128=", + constraints.aes128_constraints.size()); + + // Skip circuits with no parallelizable constraints (e.g., brillig-only programs) + bool has_constraints = !constraints.quad_constraints.empty() || !constraints.big_quad_constraints.empty() || + !constraints.logic_constraints.empty() || !constraints.range_constraints.empty() || + !constraints.sha256_compression.empty() || !constraints.ecdsa_k1_constraints.empty() || + !constraints.ecdsa_r1_constraints.empty() || !constraints.poseidon2_constraints.empty() || + !constraints.multi_scalar_mul_constraints.empty() || + 
!constraints.ec_add_constraints.empty() || !constraints.aes128_constraints.empty() || + !constraints.blake2s_constraints.empty() || !constraints.blake3_constraints.empty() || + !constraints.keccak_permutations.empty(); + if (!has_constraints) { + GTEST_SKIP() << "No parallelizable constraints"; + } + + // Skip recursion programs (need pre-computed proof data not available in this test) + if (!constraints.honk_recursion_constraints.empty() || !constraints.avm_recursion_constraints.empty() || + !constraints.hn_recursion_constraints.empty() || !constraints.chonk_recursion_constraints.empty()) { + GTEST_SKIP() << "Recursion constraints not supported in this test"; + } + + // 1. Build sequentially via create_circuit (uses build_constraints) + AcirProgram seq_program{ constraints, WitnessVector(witness) }; + auto seq_builder = create_circuit(seq_program, ProgramMetadata{}); + + // 2. Build via parallel path with N=1 + AcirFormat n1_constraints = constraints; + UltraCircuitBuilder n1_builder{ WitnessVector(witness), n1_constraints.public_inputs, false }; + build_constraints_parallel(n1_builder, n1_constraints, ProgramMetadata{}, /*num_threads=*/1); + + // 3. 
Build via parallel path with N=2 + AcirFormat n2_constraints = constraints; + UltraCircuitBuilder n2_builder{ WitnessVector(witness), n2_constraints.public_inputs, false }; + build_constraints_parallel(n2_builder, n2_constraints, ProgramMetadata{}, /*num_threads=*/2); + + // Print block sizes for all three builders + { + auto sb = seq_builder.blocks.get(); + auto n1b = n1_builder.blocks.get(); + auto n2b = n2_builder.blocks.get(); + for (size_t bl = 0; bl < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS; bl++) { + if (sb[bl].size() > 0 || n1b[bl].size() > 0 || n2b[bl].size() > 0) { + info(" block ", bl, ": seq=", sb[bl].size(), " n1=", n1b[bl].size(), " n2=", n2b[bl].size()); + } + } + info(" vars: seq=", + seq_builder.get_num_variables(), + " n1=", + n1_builder.get_num_variables(), + " n2=", + n2_builder.get_num_variables()); + info(" constants: seq=", + seq_builder.constant_variable_indices.size(), + " n1=", + n1_builder.constant_variable_indices.size(), + " n2=", + n2_builder.constant_variable_indices.size()); + info(" range_lists: seq=", + seq_builder.range_lists.size(), + " n1=", + n1_builder.range_lists.size(), + " n2=", + n2_builder.range_lists.size()); + for (const auto& [target, rl] : seq_builder.range_lists) { + auto n1_it = n1_builder.range_lists.find(target); + size_t n1_count = (n1_it != n1_builder.range_lists.end()) ? 
n1_it->second.variable_indices.size() : 0; + info(" range ", target, ": seq=", rl.variable_indices.size(), " n1=", n1_count); + } + // Check for range lists in n1 that aren't in seq + for (const auto& [target, rl] : n1_builder.range_lists) { + if (seq_builder.range_lists.find(target) == seq_builder.range_lists.end()) { + info(" range ", target, ": seq=MISSING n1=", rl.variable_indices.size()); + } + } + } + + // All three must pass circuit checker + bool seq_ok = CircuitChecker::check(seq_builder); + bool n1_ok = CircuitChecker::check(n1_builder); + bool n2_ok = CircuitChecker::check(n2_builder); + EXPECT_TRUE(seq_ok) << test_name << ": sequential CircuitChecker failed"; + EXPECT_TRUE(n1_ok) << test_name << ": N=1 CircuitChecker failed"; + EXPECT_TRUE(n2_ok) << test_name << ": N=2 CircuitChecker failed"; + + // Sequential vs N=1: semantic equivalence (same constraints, different order) + size_t seq_n1_failures = check_semantic_equivalence(test_name + " seq-vs-n1", seq_builder, n1_builder); + EXPECT_EQ(seq_n1_failures, 0) << test_name << ": sequential vs N=1 semantic equivalence failed"; + + // Sequential vs N=2: semantic equivalence + size_t seq_n2_failures = check_semantic_equivalence(test_name + " seq-vs-n2", seq_builder, n2_builder); + EXPECT_EQ(seq_n2_failures, 0) << test_name << ": sequential vs N=2 semantic equivalence failed"; + + // N=1 vs N=2: must be bit-identical + size_t n1_n2_mismatches = check_bit_identical(test_name + " n1-vs-n2", n1_builder, n2_builder); + if (n1_n2_mismatches > 0) { + // Print first few wire mismatches + auto n1b = n1_builder.blocks.get(); + auto n2b = n2_builder.blocks.get(); + size_t printed = 0; + for (size_t b = 0; b < UltraCircuitBuilder::ExecutionTrace::NUM_BLOCKS && printed < 5; b++) { + size_t count = std::min(n1b[b].size(), n2b[b].size()); + for (size_t w = 0; w < 4 && printed < 5; w++) { + for (size_t i = 0; i < count && printed < 5; i++) { + if (n1b[b].wires[w][i] != n2b[b].wires[w][i]) { + info(" WIRE DIFF block=", + b, 
+ " gate=", + i, + " wire=", + w, + " n1=", + n1b[b].wires[w][i], + " n2=", + n2b[b].wires[w][i]); + printed++; + } + } + } + } + // Print first few real_variable_index mismatches + size_t num_vars = std::min(n1_builder.get_num_variables(), n2_builder.get_num_variables()); + printed = 0; + for (size_t i = 0; i < num_vars && printed < 5; i++) { + if (n1_builder.real_variable_index[i] != n2_builder.real_variable_index[i]) { + info(" REAL_VAR_IDX DIFF var=", + i, + " n1=", + n1_builder.real_variable_index[i], + " n2=", + n2_builder.real_variable_index[i]); + printed++; + } + } + } + EXPECT_EQ(n1_n2_mismatches, 0) << test_name << ": N=1 vs N=2 bit-identical check failed"; +} + +INSTANTIATE_TEST_SUITE_P(AcirTests, + AcirTestParallelEquivalence, + ::testing::ValuesIn(collect_acir_test_programs()), + [](const ::testing::TestParamInfo& info) { + return info.param.filename().string(); + }); diff --git a/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp b/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp index 609c2c380358..47be60cb4c38 100644 --- a/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/execution_trace/execution_trace_block.hpp @@ -19,6 +19,19 @@ namespace bb { +// Thread-local index for parallel circuit construction. Used by Selector, ExecutionTraceBlock, +// and CircuitBuilderBase to route operations through per-thread cursors. +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +inline thread_local size_t parallel_thread_idx = 0; +inline void set_parallel_thread_index(size_t idx) +{ + parallel_thread_idx = idx; +} +inline size_t get_parallel_thread_index() +{ + return parallel_thread_idx; +} + #ifdef CHECK_CIRCUIT_STACKTRACES struct BbStackTrace : backward::StackTrace { BbStackTrace() { load_here(32); } @@ -119,6 +132,46 @@ template class Selector { * @brief Release all memory held by this selector. 
 */ virtual void free_memory() {} + + /** + * @brief Enable cursor mode for a specific thread. + * @details Used for parallel circuit construction where blocks are pre-allocated and threads write at + * pre-determined offsets. The underlying storage must already be sized to accommodate the writes. + * Thread index is set via set_parallel_thread_index() before processing opcodes. + */ + void enable_cursor_mode(size_t thread_idx, size_t start) + { + if (thread_idx >= cursors_.size()) { + cursors_.resize(thread_idx + 1, CURSOR_DISABLED); + } + cursors_[thread_idx] = start; + } + + // Legacy single-thread interface (uses thread index 0) + void enable_cursor_mode(size_t start) { enable_cursor_mode(0, start); } + + void disable_cursor_mode(size_t thread_idx) + { + if (thread_idx < cursors_.size()) { + cursors_[thread_idx] = CURSOR_DISABLED; + } + } + void disable_cursor_mode() { disable_cursor_mode(0); } + + bool is_cursor_mode() const { return active_cursor() != CURSOR_DISABLED; } + + size_t active_cursor() const + { + auto idx = get_parallel_thread_index(); + return (cursors_.empty() || idx >= cursors_.size()) ? 
CURSOR_DISABLED : cursors_[idx]; + } + + size_t& active_cursor_ref() { return cursors_[get_parallel_thread_index()]; } + + static constexpr size_t CURSOR_DISABLED = std::numeric_limits::max(); + + protected: + std::vector cursors_; // per-thread cursors }; /** @@ -134,13 +187,21 @@ template class ZeroSelector : public Selector { void emplace_back(int value) override { BB_ASSERT_EQ(value, 0, "Calling ZeroSelector::emplace_back with a non zero value."); - size_++; + if (this->is_cursor_mode()) { + this->active_cursor_ref()++; + } else { + size_++; + } } void push_back(const FF& value) override { BB_ASSERT(value.is_zero()); - size_++; + if (this->is_cursor_mode()) { + this->active_cursor_ref()++; + } else { + size_++; + } } void set(size_t, int) override { BB_ASSERT(false, "ZeroSelector::set should not be called"); } @@ -179,8 +240,22 @@ template class SlabVectorSelector : public Selector { public: using Selector::emplace_back; - void emplace_back(int i) override { data.emplace_back(i); } - void push_back(const FF& value) override { data.push_back(value); } + void emplace_back(int i) override + { + if (this->is_cursor_mode()) { + data[this->active_cursor_ref()++] = i; + } else { + data.emplace_back(i); + } + } + void push_back(const FF& value) override + { + if (this->is_cursor_mode()) { + data[this->active_cursor_ref()++] = value; + } else { + data.push_back(value); + } + } void set(size_t idx, int i) override { data[idx] = i; } void set(size_t idx, const FF& value) override { data[idx] = value; } void resize(size_t new_size) override { data.resize(new_size); } @@ -246,6 +321,14 @@ template class ExecutionTraceBlock { size_t cached_size_ = 0; // set by free_data() so size() works after freeing bool data_freed_ = false; // true after free_data() has been called uint32_t trace_offset_ = std::numeric_limits::max(); // where this block starts in the trace + std::vector wire_cursors_; // per-thread wire cursors + + size_t wire_active_cursor() const + { + auto idx = 
get_parallel_thread_index(); + return (wire_cursors_.empty() || idx >= wire_cursors_.size()) ? Selector::CURSOR_DISABLED + : wire_cursors_[idx]; + } uint32_t trace_offset() const { @@ -257,6 +340,65 @@ template class ExecutionTraceBlock { size_t size() const { return data_freed_ ? cached_size_ : std::get<0>(this->wires).size(); } + /** + * @brief Get the index of the gate most recently written (via populate_wires). + * @details In cursor mode, populate_wires writes at the cursor and then increments it, + * so the last gate is at cursor - 1. In normal mode, it's size() - 1 as usual. + * Must be called immediately after populate_wires (before any other writes to this block). + */ + size_t last_gate_index() const + { + size_t wc = wire_active_cursor(); + if (wc != Selector::CURSOR_DISABLED) { + return wc - 1; + } + return size() - 1; + } + + /** + * @brief Get the index where the next gate will be written. + * @details In cursor mode, returns the current cursor position. In normal mode, returns size(). + */ + size_t next_gate_index() const + { + size_t wc = wire_active_cursor(); + if (wc != Selector::CURSOR_DISABLED) { + return wc; + } + return size(); + } + + /** + * @brief Enable cursor mode for a thread: subsequent gate writes go to position `start` and advance. + * @details The block's wires and selectors must already be sized to accommodate the writes. + * Used for parallel circuit construction where threads write to pre-allocated regions. 
+ */ + void enable_cursor_mode(size_t thread_idx, size_t start) + { + if (thread_idx >= wire_cursors_.size()) { + wire_cursors_.resize(thread_idx + 1, Selector::CURSOR_DISABLED); + } + wire_cursors_[thread_idx] = start; + for (auto& sel : get_selectors()) { + sel.enable_cursor_mode(thread_idx, start); + } + } + + // Legacy single-thread interface + void enable_cursor_mode(size_t start) { enable_cursor_mode(0, start); } + + void disable_cursor_mode(size_t thread_idx) + { + if (thread_idx < wire_cursors_.size()) { + wire_cursors_[thread_idx] = Selector::CURSOR_DISABLED; + } + for (auto& sel : get_selectors()) { + sel.disable_cursor_mode(thread_idx); + } + } + + void disable_cursor_mode() { disable_cursor_mode(0); } + #ifdef TRACY_HACK_GATES_AS_MEMORY ~ExecutionTraceBlock() { @@ -295,10 +437,19 @@ template class ExecutionTraceBlock { this->stack_traces.populate(); #endif this->tracy_gate(); - this->wires[0].emplace_back(idx_1); - this->wires[1].emplace_back(idx_2); - this->wires[2].emplace_back(idx_3); - this->wires[3].emplace_back(idx_4); + size_t wc = wire_active_cursor(); + if (wc != Selector::CURSOR_DISABLED) { + this->wires[0][wc] = idx_1; + this->wires[1][wc] = idx_2; + this->wires[2][wc] = idx_3; + this->wires[3][wc] = idx_4; + wire_cursors_[get_parallel_thread_index()]++; + } else { + this->wires[0].emplace_back(idx_1); + this->wires[1].emplace_back(idx_2); + this->wires[2].emplace_back(idx_3); + this->wires[3].emplace_back(idx_4); + } } auto& w_l() { return std::get<0>(this->wires); }; diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp index 3a556e5a6309..c9b031bc0716 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base.hpp @@ -9,6 +9,7 @@ #include "barretenberg/ecc/curves/bn254/bn254.hpp" #include 
"barretenberg/ecc/curves/bn254/fr.hpp" #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" +#include "barretenberg/honk/execution_trace/execution_trace_block.hpp" // for get_parallel_thread_index #include "barretenberg/honk/execution_trace/gate_data.hpp" #include "barretenberg/public_input_component/public_component_key.hpp" #include "barretenberg/serialize/msgpack.hpp" @@ -52,6 +53,44 @@ template class CircuitBuilderBase { size_t _num_gates = 0; + public: + // Cursor for parallel variable allocation. When enabled, add_variable writes at cursor position + // instead of appending. The variable vectors must be pre-sized to accommodate the writes. + static constexpr uint32_t VARIABLE_CURSOR_DISABLED = UINT32_MAX; + + // Deferred assert_equal entries for parallel construction. In cursor mode, assert_equal calls + // are recorded per-task and replayed in deterministic task order after all threads join. + // This prevents nondeterministic union-find results when multiple threads assert_equal on + // the same shared ACIR witness. + struct DeferredAssertEqual { + uint32_t a_variable_idx; + uint32_t b_variable_idx; + std::string msg; + bool operator==(const DeferredAssertEqual&) const = default; + }; + std::vector> deferred_assert_equals_; // per-task + + void init_deferred_assert_equal_buffers(size_t num_tasks) { deferred_assert_equals_.resize(num_tasks); } + + // Set which task index the current thread is executing (for assert_equal deferral). + // Thread-local so concurrent threads don't overwrite each other's task index. + void set_current_task_index(size_t task_idx) { current_task_idx_ = task_idx; } + static inline thread_local size_t current_task_idx_ = 0; + + void apply_deferred_assert_equals() + { + // Replay in task order (0, 1, 2, ...) 
for deterministic union-find results + for (auto& task_buf : deferred_assert_equals_) { + for (auto& entry : task_buf) { + assert_equal(entry.a_variable_idx, entry.b_variable_idx, entry.msg); + } + task_buf.clear(); + } + } + + private: + std::vector variable_cursors_; // per-thread variable cursors + /** * @brief Update all variables from index in equivalence class to have real variable new_real_index * @param index The index of a variable in the class we're updating @@ -144,6 +183,10 @@ template class CircuitBuilderBase { void increment_num_gates(size_t count = 1) { BB_ASSERT(!circuit_finalized, "Cannot add gates after circuit is finalized"); + // In cursor mode, gate count is pre-computed; skip to avoid races in parallel construction + if (get_variable_cursor() != VARIABLE_CURSOR_DISABLED) { + return; + } _num_gates += count; } @@ -188,6 +231,8 @@ template class CircuitBuilderBase { } const std::vector& public_inputs() const { return _public_inputs; }; + const std::vector& get_next_var_index() const { return next_var_index; } + const std::vector& get_prev_var_index() const { return prev_var_index; } /** * @brief Set the _public_inputs_finalized to true to prevent any new public inputs from being added @@ -211,6 +256,50 @@ template class CircuitBuilderBase { */ virtual uint32_t add_variable(const FF& in); + /** + * @brief Enable variable cursor mode for parallel construction. + * @details When enabled, add_variable writes at the cursor position instead of appending. + * The variables/real_variable_index/next_var_index/prev_var_index/real_variable_tags vectors + * must be pre-sized to accommodate the writes. 
+ */ + void enable_variable_cursor(size_t thread_idx, uint32_t start) + { + if (thread_idx >= variable_cursors_.size()) { + variable_cursors_.resize(thread_idx + 1, VARIABLE_CURSOR_DISABLED); + } + variable_cursors_[thread_idx] = start; + } + // Legacy single-thread interface + void enable_variable_cursor(uint32_t start) { enable_variable_cursor(0, start); } + + void disable_variable_cursor(size_t thread_idx) + { + if (thread_idx < variable_cursors_.size()) { + variable_cursors_[thread_idx] = VARIABLE_CURSOR_DISABLED; + } + } + void disable_variable_cursor() { disable_variable_cursor(0); } + + uint32_t get_variable_cursor() const + { + auto idx = get_parallel_thread_index(); + return (variable_cursors_.empty() || idx >= variable_cursors_.size()) ? VARIABLE_CURSOR_DISABLED + : variable_cursors_[idx]; + } + + /** + * @brief Pre-allocate variable storage for parallel construction. + * @param total_size The total number of variables (existing + new from all threads). + */ + void resize_variables(size_t total_size) + { + variables.resize(total_size); + real_variable_index.resize(total_size); + next_var_index.resize(total_size); + prev_var_index.resize(total_size); + real_variable_tags.resize(total_size); + } + // Disallow add_variable for non-FF types to prevent implicit conversions (specifically, using indices rather // than values) template uint32_t add_variable(const OT& in) = delete; diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp index 2571bf23179a..178c60309114 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/circuit_builder_base_impl.hpp @@ -45,6 +45,17 @@ void CircuitBuilderBase::update_real_variable_indices(uint32_t index, uint3 template uint32_t CircuitBuilderBase::add_variable(const FF& in) { + uint32_t 
cursor = get_variable_cursor(); + if (cursor != VARIABLE_CURSOR_DISABLED) { + auto thread_idx = get_parallel_thread_index(); + const uint32_t index = variable_cursors_[thread_idx]++; + variables[index] = in; + real_variable_index[index] = index; + next_var_index[index] = REAL_VARIABLE; + prev_var_index[index] = FIRST_VARIABLE_IN_CLASS; + real_variable_tags[index] = DEFAULT_TAG; + return index; + } variables.emplace_back(in); const uint32_t index = static_cast(variables.size()) - 1U; real_variable_index.emplace_back(index); @@ -114,6 +125,13 @@ void CircuitBuilderBase::assert_equal(const uint32_t a_variable_idx, const uint32_t b_variable_idx, std::string const& msg) { + // In cursor mode, defer assert_equal to avoid nondeterministic union-find results + // when multiple threads modify chains rooted at the same shared witness. + // Deferred entries are stored per-task and replayed in task order after all threads join. + if (get_variable_cursor() != VARIABLE_CURSOR_DISABLED) { + deferred_assert_equals_[current_task_idx_].push_back({ a_variable_idx, b_variable_idx, msg }); + return; + } assert_valid_variables({ a_variable_idx, b_variable_idx }); bool values_equal = (get_variable(a_variable_idx) == get_variable(b_variable_idx)); if (!values_equal && !failed()) { diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp index a4ff064e1c94..a0bdffdcf91f 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.cpp @@ -13,6 +13,14 @@ namespace bb { template size_t RomRamLogic_::create_ROM_array(const size_t array_size) { + // In cursor mode (parallel construction), arrays are pre-created during setup. + // Return the next pre-assigned ID for this thread and advance the cursor. 
+ if (rom_cursor_active()) { + size_t id = rom_id_cursors_[get_parallel_thread_index()]++; + BB_ASSERT(id < rom_arrays.size()); + BB_ASSERT(rom_arrays[id].state.size() == array_size); + return id; + } RomTranscript new_transcript; for (size_t i = 0; i < array_size; ++i) { new_transcript.state.emplace_back( @@ -158,7 +166,7 @@ void RomRamLogic_::create_ROM_gate(CircuitBuilder* builder, RomR builder->blocks.memory.populate_wires( record.index_witness, record.value_column1_witness, record.value_column2_witness, record.record_witness); // Note: record the index into the memory block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->check_selector_length_consistency(); builder->increment_num_gates(); } @@ -173,7 +181,7 @@ void RomRamLogic_::create_sorted_ROM_gate(CircuitBuilder* builde builder->blocks.memory.populate_wires( record.index_witness, record.value_column1_witness, record.value_column2_witness, record.record_witness); // Note: record the index into the memory block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->check_selector_length_consistency(); builder->increment_num_gates(); } @@ -276,6 +284,13 @@ template void RomRamLogic_::process_RO template size_t RomRamLogic_::create_RAM_array(const size_t array_size) { + // In cursor mode (parallel construction), arrays are pre-created during setup. 
+ if (ram_cursor_active()) { + size_t id = ram_id_cursors_[get_parallel_thread_index()]++; + BB_ASSERT(id < ram_arrays.size()); + BB_ASSERT(ram_arrays[id].state.size() == array_size); + return id; + } RamTranscript new_transcript; for (size_t i = 0; i < array_size; ++i) { new_transcript.state.emplace_back(UNINITIALIZED_MEMORY_RECORD); @@ -418,7 +433,7 @@ void RomRamLogic_::create_RAM_gate(CircuitBuilder* builder, RamR record.index_witness, record.timestamp_witness, record.value_witness, record.record_witness); // Note: record the index into the block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->increment_num_gates(); } @@ -430,7 +445,7 @@ void RomRamLogic_::create_sorted_RAM_gate(CircuitBuilder* builde builder->blocks.memory.populate_wires( record.index_witness, record.timestamp_witness, record.value_witness, record.record_witness); // Note: record the index into the memory block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size() - 1; + record.gate_index = builder->blocks.memory.last_gate_index(); builder->check_selector_length_consistency(); builder->increment_num_gates(); } @@ -442,7 +457,7 @@ void RomRamLogic_::create_final_sorted_RAM_gate(CircuitBuilder* { record.record_witness = builder->add_variable(FF(0)); // Note: record the index into the block that contains the RAM/ROM gates - record.gate_index = builder->blocks.memory.size(); // no -1 since we _haven't_ added the gate yet + record.gate_index = builder->blocks.memory.next_gate_index(); // index where the gate _will_ be written // Create a final gate with all selectors zero (hence unconstrained). In particular, the `MEMORY_SELECTORS` are not // on. Wire values are accessed by the previous RAM gate via shifted wires. 
diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp index 999ada11850a..8e405a88d144 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/rom_ram_logic.hpp @@ -149,6 +149,29 @@ template class RomRamLogic_ { */ std::vector rom_arrays; + // Per-thread ROM/RAM ID cursors for parallel construction. When enabled, create_ROM_array/ + // create_RAM_array return the cursor value and increment it instead of pushing to the vectors. + // Arrays must be pre-created during the setup phase before enabling cursors. + std::vector rom_id_cursors_; // per-thread + std::vector ram_id_cursors_; // per-thread + + void enable_rom_cursor(size_t thread_idx, size_t start) + { + if (thread_idx >= rom_id_cursors_.size()) { + rom_id_cursors_.resize(thread_idx + 1, 0); + } + rom_id_cursors_[thread_idx] = start; + } + void enable_ram_cursor(size_t thread_idx, size_t start) + { + if (thread_idx >= ram_id_cursors_.size()) { + ram_id_cursors_.resize(thread_idx + 1, 0); + } + ram_id_cursors_[thread_idx] = start; + } + bool rom_cursor_active() const { return !rom_id_cursors_.empty(); } + bool ram_cursor_active() const { return !ram_id_cursors_.empty(); } + RomRamLogic_() = default; // ROM operations diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp index da88199681e8..1cc12be2a975 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.cpp @@ -378,15 +378,25 @@ void UltraCircuitBuilder_::create_ecc_add_gate(const ecc_add_gat // The elliptic curve relation assumes q_sign² = 1 (see elliptic_relation.hpp) const FF q_sign = 
in.is_addition ? FF(1) : FF(-1); - // Determine whether we can fuse this addition operation into the previous gate in the block - bool can_fuse_into_previous_gate = - block.size() > 0 && /* a previous gate exists in the block */ - block.w_r()[block.size() - 1] == in.x1 && /* output x coord of previous gate is input of this one */ - block.w_o()[block.size() - 1] == in.y1; /* output y coord of previous gate is input of this one */ + // Determine whether we can fuse this addition into the previous gate in the block. + // In cursor mode, use cursor position (not block.size() which returns pre-allocated total). + // NOTE: For future work-stealing parallelism where task execution order may differ, fusion + // across task boundaries must be handled carefully to maintain determinism. + size_t cursor = block.wire_active_cursor(); + bool can_fuse_into_previous_gate; + size_t prev_idx; + if (cursor != Selector::CURSOR_DISABLED) { + prev_idx = cursor - 1; + can_fuse_into_previous_gate = cursor > 0 && block.w_r()[prev_idx] == in.x1 && block.w_o()[prev_idx] == in.y1; + } else { + prev_idx = block.size() - 1; + can_fuse_into_previous_gate = + block.size() > 0 && block.w_r()[prev_idx] == in.x1 && block.w_o()[prev_idx] == in.y1; + } if (can_fuse_into_previous_gate) { - block.q_1().set(block.size() - 1, q_sign); // set q_sign of previous gate - block.q_elliptic().set(block.size() - 1, 1); // set q_ecc of previous gate to 1 + block.q_1().set(prev_idx, q_sign); // set q_sign of previous gate + block.q_elliptic().set(prev_idx, 1); // set q_ecc of previous gate to 1 } else { block.populate_wires(this->zero_idx(), in.x1, in.y1, this->zero_idx()); block.q_3().emplace_back(0); @@ -427,16 +437,22 @@ void UltraCircuitBuilder_::create_ecc_dbl_gate(const ecc_dbl_gat auto& block = blocks.elliptic; - // Determine whether we can fuse this doubling operation into the previous gate in the block - bool can_fuse_into_previous_gate = - block.size() > 0 && /* a previous gate exists in the block */ - 
block.w_r()[block.size() - 1] == in.x1 && /* output x coord of previous gate is input of this one */ - block.w_o()[block.size() - 1] == in.y1; /* output y coord of previous gate is input of this one */ - + size_t dbl_cursor = block.wire_active_cursor(); + bool can_fuse_into_previous_gate; + size_t dbl_prev_idx; + if (dbl_cursor != Selector::CURSOR_DISABLED) { + dbl_prev_idx = dbl_cursor - 1; + can_fuse_into_previous_gate = + dbl_cursor > 0 && block.w_r()[dbl_prev_idx] == in.x1 && block.w_o()[dbl_prev_idx] == in.y1; + } else { + dbl_prev_idx = block.size() - 1; + can_fuse_into_previous_gate = + block.size() > 0 && block.w_r()[dbl_prev_idx] == in.x1 && block.w_o()[dbl_prev_idx] == in.y1; + } // If possible, update the previous gate to be the first gate in the pair, otherwise create a new gate if (can_fuse_into_previous_gate) { - block.q_elliptic().set(block.size() - 1, 1); // set q_ecc of previous gate to 1 - block.q_m().set(block.size() - 1, 1); // set q_m (q_is_double) of previous gate to 1 + block.q_elliptic().set(dbl_prev_idx, 1); // set q_ecc of previous gate to 1 + block.q_m().set(dbl_prev_idx, 1); // set q_m (q_is_double) of previous gate to 1 } else { block.populate_wires(this->zero_idx(), in.x1, in.y1, this->zero_idx()); block.q_m().emplace_back(1); @@ -484,12 +500,18 @@ uint32_t UltraCircuitBuilder_::put_constant_variable(const FF& v { if (constant_variable_indices.contains(variable)) { return constant_variable_indices.at(variable); - } else { + } + // In cursor mode (parallel construction), don't insert into the shared cache. + // New constants that weren't pre-registered get fresh variables without deduplication. 
+ if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { uint32_t variable_index = this->add_variable(variable); fix_witness(variable_index, variable); - constant_variable_indices.insert({ variable, variable_index }); return variable_index; } + uint32_t variable_index = this->add_variable(variable); + fix_witness(variable_index, variable); + constant_variable_indices.insert({ variable, variable_index }); + return variable_index; } /** @@ -559,7 +581,13 @@ plookup::ReadData UltraCircuitBuilder_::create_gates_f // Get basic lookup table; construct and add to builder.lookup_tables if not already present plookup::BasicTable& table = get_table(multi_table.basic_table_ids[i]); - table.lookup_gates.emplace_back(read_values.lookup_entries[i]); + // In cursor mode, defer the lookup gate entry to avoid races on table.lookup_gates + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + auto tidx = get_parallel_thread_index(); + deferred_lookup_gates_.defer(tidx, { multi_table.basic_table_ids[i], read_values.lookup_entries[i] }); + } else { + table.lookup_gates.emplace_back(read_values.lookup_entries[i]); + } // Create witness variables: first lookup reuses user's input indices, subsequent create new variables const auto first_idx = is_first_lookup ? key_a_index : this->add_variable(read_values[ColumnIdx::C1][i]); @@ -756,6 +784,13 @@ void UltraCircuitBuilder_::create_small_range_constraint(const u const uint64_t target_range, std::string const msg) { + // In cursor mode, defer range constraint to avoid races on range_lists and real_variable_tags + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + auto tidx = get_parallel_thread_index(); + deferred_range_constraints_.defer(tidx, { variable_index, target_range }); + return; + } + // make sure `target_range` is not too big. 
BB_ASSERT_GTE(MAX_SMALL_RANGE_CONSTRAINT_VAL, target_range); const bool is_out_of_range = (uint256_t(this->get_variable(variable_index)).data[0] > target_range); @@ -1522,7 +1557,8 @@ std::array UltraCircuitBuilder_::queue_partial_non_ const uint32_t hi_0_idx = this->add_variable(hi_0); const uint32_t hi_1_idx = this->add_variable(hi_1); - // Add witnesses into the multiplication cache (duplicates removed during circuit finalization) + // Add witnesses into the multiplication cache (duplicates removed during circuit finalization). + // In cursor mode, defer to per-thread buffer to avoid races on the shared vector. cached_partial_non_native_field_multiplication cache_entry{ .a = input.a, .b = input.b, @@ -1530,7 +1566,11 @@ std::array UltraCircuitBuilder_::queue_partial_non_ .hi_0 = hi_0_idx, .hi_1 = hi_1_idx, }; - cached_partial_non_native_field_multiplications.emplace_back(cache_entry); + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + deferred_non_native_field_muls_.defer(get_parallel_thread_index(), cache_entry); + } else { + cached_partial_non_native_field_multiplications.emplace_back(cache_entry); + } return std::array{ lo_0_idx, hi_1_idx }; } diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp index 2bcfae4938f7..be5c150980ae 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_circuit_builder.hpp @@ -14,7 +14,9 @@ #include "circuit_builder_base.hpp" #include "rom_ram_logic.hpp" #include +#include #include +#include #include #include "barretenberg/serialize/msgpack.hpp" @@ -200,11 +202,224 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase constant_variable_indices; + /** + * @brief Per-thread buffer for deferring operations during parallel construction. 
+ * @details Operations that modify shared builder state are buffered per-thread during + * execute_parallel and replayed sequentially after all threads join. The replay callback + * receives each entry and applies it to the builder. + */ + template struct DeferredBuffer { + std::vector> thread_buffers; + + void init(size_t num_threads) { thread_buffers.resize(num_threads); } + + void defer(size_t thread_idx, Entry&& entry) { thread_buffers[thread_idx].emplace_back(std::move(entry)); } + + void defer(size_t thread_idx, const Entry& entry) { thread_buffers[thread_idx].push_back(entry); } + + template void apply(Callback&& callback) + { + for (auto& buf : thread_buffers) { + for (auto& entry : buf) { + callback(entry); + } + buf.clear(); + } + } + }; + + struct DeferredLookupEntry { + plookup::BasicTableId table_id; + plookup::BasicTable::LookupEntry entry; + }; + struct DeferredRangeConstraint { + uint32_t variable_index; + uint64_t target_range; + }; + + DeferredBuffer deferred_lookup_gates_; + DeferredBuffer deferred_range_constraints_; + DeferredBuffer deferred_non_native_field_muls_; + + void init_deferred_buffers(size_t num_threads) + { + deferred_lookup_gates_.init(num_threads); + deferred_range_constraints_.init(num_threads); + deferred_non_native_field_muls_.init(num_threads); + } + + /** + * @brief Per-block gate counts and variable count for a task (one or more opcodes). + */ + struct TaskBlockSizes { + std::array block_sizes{}; + size_t num_variables = 0; + size_t num_rom_arrays = 0; + size_t num_ram_arrays = 0; + }; + + /** + * @brief Snapshot the current block sizes and variable count. + */ + TaskBlockSizes snapshot_block_sizes() const + { + TaskBlockSizes s; + auto block_refs = blocks.get(); + for (size_t i = 0; i < ExecutionTrace::NUM_BLOCKS; i++) { + s.block_sizes[i] = block_refs[i].size(); + } + s.num_variables = this->get_num_variables(); + return s; + } + + /** + * @brief Compute the delta between two snapshots (after - before). 
+ */ + static TaskBlockSizes delta(const TaskBlockSizes& before, const TaskBlockSizes& after) + { + TaskBlockSizes d; + for (size_t i = 0; i < ExecutionTrace::NUM_BLOCKS; i++) { + d.block_sizes[i] = after.block_sizes[i] - before.block_sizes[i]; + } + d.num_variables = after.num_variables - before.num_variables; + return d; + } + + /** + * @brief Execute tasks in parallel on this builder. Each task is a lambda that adds gates to the builder. + * @details Pre-allocates blocks and variables based on per-task sizes, then dispatches tasks across threads + * with per-thread cursors. After joining, replays deferred lookup and range constraint operations. + * + * @param tasks Vector of lambdas, each taking (UltraCircuitBuilder_&) and adding gates + * @param task_sizes Per-task block sizes and variable counts (must match tasks.size()) + * @param num_threads Number of threads to use (tasks are distributed round-robin) + */ + void execute_parallel(const std::vector>& tasks, + const std::vector& task_sizes, + size_t num_threads) + { + BB_ASSERT(tasks.size() == task_sizes.size()); + if (tasks.empty()) { + return; + } + num_threads = std::min(num_threads, tasks.size()); + + // Compute total sizes and per-task offsets + auto base = snapshot_block_sizes(); + std::vector offsets(tasks.size()); + TaskBlockSizes running = base; + for (size_t t = 0; t < tasks.size(); t++) { + offsets[t] = running; + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + running.block_sizes[b] += task_sizes[t].block_sizes[b]; + } + running.num_variables += task_sizes[t].num_variables; + running.num_rom_arrays += task_sizes[t].num_rom_arrays; + running.num_ram_arrays += task_sizes[t].num_ram_arrays; + } + + // Pre-allocate all blocks and variables to total size + auto block_refs = blocks.get(); + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + auto& block = block_refs[b]; + for (auto& wire : block.wires) { + wire.resize(running.block_sizes[b], 0); + } + for (auto& sel : 
block.get_selectors()) { + sel.resize(running.block_sizes[b]); + } + } + this->resize_variables(running.num_variables); + init_deferred_buffers(num_threads); + this->init_deferred_assert_equal_buffers(tasks.size()); + + // Assign tasks to threads (round-robin) + std::vector> thread_tasks(num_threads); + for (size_t t = 0; t < tasks.size(); t++) { + thread_tasks[t % num_threads].push_back(t); + } + + // Pre-initialize all cursors on the main thread to avoid races on cursor vector resizing. + // For threads with multiple tasks, we set the cursor to the first task's offset; + // within the thread, cursors are updated sequentially between tasks. + auto block_refs_setup = blocks.get(); + for (size_t tid = 0; tid < num_threads; tid++) { + if (!thread_tasks[tid].empty()) { + size_t first_task = thread_tasks[tid][0]; + // Enable cursors for ALL blocks, not just ones the first task uses. + // Later tasks on this thread may use blocks the first task doesn't. + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + block_refs_setup[b].enable_cursor_mode(tid, offsets[first_task].block_sizes[b]); + } + this->enable_variable_cursor(tid, static_cast(offsets[first_task].num_variables)); + rom_ram_logic.enable_rom_cursor(tid, offsets[first_task].num_rom_arrays); + rom_ram_logic.enable_ram_cursor(tid, offsets[first_task].num_ram_arrays); + } + } + + // Dispatch threads + std::vector threads; + threads.reserve(num_threads); + + for (size_t tid = 0; tid < num_threads; tid++) { + threads.emplace_back([this, tid, &tasks, &offsets, &thread_tasks]() { + set_parallel_thread_index(tid); + + for (size_t i = 0; i < thread_tasks[tid].size(); i++) { + size_t task_idx = thread_tasks[tid][i]; + + // For subsequent tasks (not the first), update cursors to this task's offsets + if (i > 0) { + auto block_refs_local = blocks.get(); + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + block_refs_local[b].enable_cursor_mode(tid, offsets[task_idx].block_sizes[b]); + } + 
this->enable_variable_cursor(tid, static_cast(offsets[task_idx].num_variables)); + rom_ram_logic.enable_rom_cursor(tid, offsets[task_idx].num_rom_arrays); + rom_ram_logic.enable_ram_cursor(tid, offsets[task_idx].num_ram_arrays); + } + + // Execute the task + this->set_current_task_index(task_idx); + tasks[task_idx](*this); + } + + // Disable all cursors for this thread + auto block_refs_local = blocks.get(); + for (size_t b = 0; b < ExecutionTrace::NUM_BLOCKS; b++) { + block_refs_local[b].disable_cursor_mode(tid); + } + this->disable_variable_cursor(tid); + }); + } + + // Join all threads + for (auto& t : threads) { + t.join(); + } + + // Clear ROM/RAM cursors so subsequent sequential operations use normal path + rom_ram_logic.rom_id_cursors_.clear(); + rom_ram_logic.ram_id_cursors_.clear(); + + // Replay deferred operations + deferred_lookup_gates_.apply([this](auto& e) { + auto& table = get_table(e.table_id); + table.lookup_gates.emplace_back(e.entry); + }); + deferred_range_constraints_.apply( + [this](auto& e) { create_small_range_constraint(e.variable_index, e.target_range); }); + deferred_non_native_field_muls_.apply( + [this](auto& e) { cached_partial_non_native_field_multiplications.emplace_back(e); }); + this->apply_deferred_assert_equals(); + } + // Rom/Ram logic RomRamLogic rom_ram_logic; // Stores gate index of ROM/RAM reads (required by proving key) std::vector memory_read_records; + // Stores gate index of RAM writes (required by proving key) std::vector memory_write_records; // Range constraints to be batched, keyed by target_range. See create_small_range_constraint() for details. @@ -635,7 +850,14 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase x*(x^-1) = 1). 
*/ - void update_used_witnesses(uint32_t var_idx) { used_witnesses.emplace_back(var_idx); } + void update_used_witnesses(uint32_t var_idx) + { + // Skip in cursor mode to avoid races on shared used_witnesses vector + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } + used_witnesses.emplace_back(var_idx); + } /** * @brief Add a list of witness indices to the boomerang exclusion list @@ -646,6 +868,10 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase& used_indices) { + // Skip in cursor mode to avoid races on shared used_witnesses vector + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } used_witnesses.reserve(used_witnesses.size() + used_indices.size()); for (const auto& it : used_indices) { used_witnesses.emplace_back(it); @@ -659,7 +885,13 @@ class UltraCircuitBuilder_ : public CircuitBuilderBaseget_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } + finalize_witnesses.insert(var_idx); + } /** * @brief Add a list of witness indices to the finalize exclusion list @@ -670,6 +902,9 @@ class UltraCircuitBuilder_ : public CircuitBuilderBase& finalize_indices) { + if (this->get_variable_cursor() != this->VARIABLE_CURSOR_DISABLED) { + return; + } for (const auto& it : finalize_indices) { finalize_witnesses.insert(it); }