@@ -278,44 +278,53 @@ ConstraintProfile profile_constraint_type(ConstraintType representative, Handler
278278{
279279 ConstraintProfile profile;
280280
281- // Create a throwaway builder with enough witness slots
281+ // Phase A: Run one instance on a throwaway builder to discover one-time setup needs (constants, range lists, lookup tables).
282282 WitnessVector dummy_witness (num_witnesses, bb::fr (0 ));
283- UltraCircuitBuilder throwaway{ dummy_witness, {}, false };
283+ UltraCircuitBuilder warmup_builder{ dummy_witness, {}, /* is_write_vk_mode=*/ true };
284+ handler (warmup_builder, representative);
284285
285- // First instance: triggers one-time setup
286- handler (throwaway, representative);
287-
288- // Second instance: measures steady-state cost
289- auto before = throwaway.snapshot_block_sizes ();
290- size_t rom_before = throwaway.rom_ram_logic .rom_arrays .size ();
291- size_t ram_before = throwaway.rom_ram_logic .ram_arrays .size ();
292- handler (throwaway, representative);
293- auto after = throwaway.snapshot_block_sizes ();
294- profile.block_sizes = UltraCircuitBuilder::delta (before, after);
295-
296- // Extract ROM/RAM array counts per instance
297- profile.num_rom_arrays_per_instance = throwaway.rom_ram_logic .rom_arrays .size () - rom_before;
298- profile.num_ram_arrays_per_instance = throwaway.rom_ram_logic .ram_arrays .size () - ram_before;
299- for (size_t i = rom_before; i < throwaway.rom_ram_logic .rom_arrays .size (); i++) {
300- profile.rom_array_sizes .push_back (throwaway.rom_ram_logic .rom_arrays [i].state .size ());
286+ // Extract setup data from the warmup builder
287+ for (const auto & [value, _] : warmup_builder.constant_variable_indices ) {
288+ profile.constants .push_back (value);
301289 }
302- for (size_t i = ram_before; i < throwaway. rom_ram_logic . ram_arrays . size (); i++ ) {
303- profile.ram_array_sizes .push_back (throwaway. rom_ram_logic . ram_arrays [i]. state . size () );
290+ for (const auto & [target_range, _] : warmup_builder. range_lists ) {
291+ profile.range_list_targets .push_back (target_range );
304292 }
305-
306- // Extract constants
307- for (const auto & [value, _] : throwaway.constant_variable_indices ) {
308- profile.constants .push_back (value);
293+ for (const auto & table : warmup_builder.get_lookup_tables ()) {
294+ profile.table_ids .push_back (table.id );
309295 }
310296
311- // Extract range list targets
312- for (const auto & [target_range, _] : throwaway.range_lists ) {
313- profile.range_list_targets .push_back (target_range);
297+ // Phase B: Measure steady-state cost on a SEPARATE builder pre-populated with setup data.
298+ // This ensures no cross-instance gate fusion at the boundary, matching cursor-mode behavior
299+ // where each task starts with no prior gates in its block region.
300+ UltraCircuitBuilder measure_builder{ WitnessVector (dummy_witness), {}, /* is_write_vk_mode=*/ true };
301+ for (const auto & value : profile.constants ) {
302+ measure_builder.put_constant_variable (value);
303+ }
304+ for (const auto target_range : profile.range_list_targets ) {
305+ if (measure_builder.range_lists .count (target_range) == 0 ) {
306+ measure_builder.range_lists .insert ({ target_range, measure_builder.create_range_list (target_range) });
307+ }
308+ }
309+ for (const auto table_id : profile.table_ids ) {
310+ measure_builder.get_table (table_id);
314311 }
315312
316- // Extract lookup table IDs
317- for (const auto & table : throwaway.get_lookup_tables ()) {
318- profile.table_ids .push_back (table.id );
313+ auto before = measure_builder.snapshot_block_sizes ();
314+ size_t rom_before = measure_builder.rom_ram_logic .rom_arrays .size ();
315+ size_t ram_before = measure_builder.rom_ram_logic .ram_arrays .size ();
316+ handler (measure_builder, representative);
317+ auto after = measure_builder.snapshot_block_sizes ();
318+ profile.block_sizes = UltraCircuitBuilder::delta (before, after);
319+
320+ // Extract per-instance ROM/RAM array counts and the size of each array created by this instance
321+ profile.num_rom_arrays_per_instance = measure_builder.rom_ram_logic .rom_arrays .size () - rom_before;
322+ profile.num_ram_arrays_per_instance = measure_builder.rom_ram_logic .ram_arrays .size () - ram_before;
323+ for (size_t i = rom_before; i < measure_builder.rom_ram_logic .rom_arrays .size (); i++) {
324+ profile.rom_array_sizes .push_back (measure_builder.rom_ram_logic .rom_arrays [i].state .size ());
325+ }
326+ for (size_t i = ram_before; i < measure_builder.rom_ram_logic .ram_arrays .size (); i++) {
327+ profile.ram_array_sizes .push_back (measure_builder.rom_ram_logic .ram_arrays [i].state .size ());
319328 }
320329
321330 return profile;
@@ -510,19 +519,110 @@ void build_constraints_parallel(UltraCircuitBuilder& builder,
510519 profile_and_collect (constraints.ecdsa_r1_constraints , [](UltraCircuitBuilder& b, const EcdsaConstraint& c) {
511520 create_ecdsa_verify_constraints<stdlib::secp256r1<UltraCircuitBuilder>>(b, c);
512521 });
513- profile_and_collect (constraints.blake2s_constraints ,
514- [](UltraCircuitBuilder& b, const Blake2sConstraint& c) { create_blake2s_constraints (b, c); });
515- profile_and_collect (constraints.blake3_constraints ,
516- [](UltraCircuitBuilder& b, const Blake3Constraint& c) { create_blake3_constraints (b, c); });
522+ // Blake2s constraints must be grouped by inputs.size() since different input lengths produce different gate counts.
523+ {
524+ std::map<size_t , std::vector<size_t >> blake2s_groups;
525+ for (size_t i = 0 ; i < constraints.blake2s_constraints .size (); i++) {
526+ blake2s_groups[constraints.blake2s_constraints [i].inputs .size ()].push_back (i);
527+ }
528+ auto handler = [](UltraCircuitBuilder& b, const Blake2sConstraint& c) { create_blake2s_constraints (b, c); };
529+ for (auto & [sz, indices] : blake2s_groups) {
530+ auto & representative = constraints.blake2s_constraints [indices[0 ]];
531+ auto profile = profile_constraint_type (representative, handler, num_witnesses);
532+ size_t profile_idx = profiles.size ();
533+ profiles.push_back (profile);
534+ auto sizes = profile.block_sizes ;
535+ sizes.num_rom_arrays = profile.num_rom_arrays_per_instance ;
536+ sizes.num_ram_arrays = profile.num_ram_arrays_per_instance ;
537+ for (size_t idx : indices) {
538+ tasks.emplace_back ([handler, &constraints, idx](UltraCircuitBuilder& b) {
539+ handler (b, constraints.blake2s_constraints [idx]);
540+ });
541+ task_sizes.push_back (sizes);
542+ task_profile_indices.push_back (profile_idx);
543+ }
544+ }
545+ }
546+ // Blake3 constraints must be grouped by inputs.size() since different input lengths produce different gate counts.
547+ {
548+ std::map<size_t , std::vector<size_t >> blake3_groups;
549+ for (size_t i = 0 ; i < constraints.blake3_constraints .size (); i++) {
550+ blake3_groups[constraints.blake3_constraints [i].inputs .size ()].push_back (i);
551+ }
552+ auto handler = [](UltraCircuitBuilder& b, const Blake3Constraint& c) { create_blake3_constraints (b, c); };
553+ for (auto & [sz, indices] : blake3_groups) {
554+ auto & representative = constraints.blake3_constraints [indices[0 ]];
555+ auto profile = profile_constraint_type (representative, handler, num_witnesses);
556+ size_t profile_idx = profiles.size ();
557+ profiles.push_back (profile);
558+ auto sizes = profile.block_sizes ;
559+ sizes.num_rom_arrays = profile.num_rom_arrays_per_instance ;
560+ sizes.num_ram_arrays = profile.num_ram_arrays_per_instance ;
561+ for (size_t idx : indices) {
562+ tasks.emplace_back ([handler, &constraints, idx](UltraCircuitBuilder& b) {
563+ handler (b, constraints.blake3_constraints [idx]);
564+ });
565+ task_sizes.push_back (sizes);
566+ task_profile_indices.push_back (profile_idx);
567+ }
568+ }
569+ }
517570 profile_and_collect (constraints.keccak_permutations , [](UltraCircuitBuilder& b, const Keccakf1600& c) {
518571 create_keccak_permutations_constraints (b, c);
519572 });
520- profile_and_collect (constraints.poseidon2_constraints , [](UltraCircuitBuilder& b, const Poseidon2Constraint& c) {
521- create_poseidon2_permutations_constraints (b, c);
522- });
523- profile_and_collect (constraints.multi_scalar_mul_constraints , [](UltraCircuitBuilder& b, const MultiScalarMul& c) {
524- create_multi_scalar_mul_constraint (b, c);
525- });
573+ // Poseidon2 constraints must be grouped by state.size() since different widths produce different gate counts.
574+ {
575+ std::map<size_t , std::vector<size_t >> pos2_groups;
576+ for (size_t i = 0 ; i < constraints.poseidon2_constraints .size (); i++) {
577+ pos2_groups[constraints.poseidon2_constraints [i].state .size ()].push_back (i);
578+ }
579+ auto handler = [](UltraCircuitBuilder& b, const Poseidon2Constraint& c) {
580+ create_poseidon2_permutations_constraints (b, c);
581+ };
582+ for (auto & [sz, indices] : pos2_groups) {
583+ auto & representative = constraints.poseidon2_constraints [indices[0 ]];
584+ auto profile = profile_constraint_type (representative, handler, num_witnesses);
585+ size_t profile_idx = profiles.size ();
586+ profiles.push_back (profile);
587+ auto sizes = profile.block_sizes ;
588+ sizes.num_rom_arrays = profile.num_rom_arrays_per_instance ;
589+ sizes.num_ram_arrays = profile.num_ram_arrays_per_instance ;
590+ for (size_t idx : indices) {
591+ tasks.emplace_back ([handler, &constraints, idx](UltraCircuitBuilder& b) {
592+ handler (b, constraints.poseidon2_constraints [idx]);
593+ });
594+ task_sizes.push_back (sizes);
595+ task_profile_indices.push_back (profile_idx);
596+ }
597+ }
598+ }
599+ // MultiScalarMul constraints must be grouped by points.size() since different point counts
600+ // produce different gate counts and different numbers of ROM arrays.
601+ {
602+ std::map<size_t , std::vector<size_t >> msm_groups;
603+ for (size_t i = 0 ; i < constraints.multi_scalar_mul_constraints .size (); i++) {
604+ msm_groups[constraints.multi_scalar_mul_constraints [i].points .size ()].push_back (i);
605+ }
606+ auto handler = [](UltraCircuitBuilder& b, const MultiScalarMul& c) {
607+ create_multi_scalar_mul_constraint (b, c);
608+ };
609+ for (auto & [sz, indices] : msm_groups) {
610+ auto & representative = constraints.multi_scalar_mul_constraints [indices[0 ]];
611+ auto profile = profile_constraint_type (representative, handler, num_witnesses);
612+ size_t profile_idx = profiles.size ();
613+ profiles.push_back (profile);
614+ auto sizes = profile.block_sizes ;
615+ sizes.num_rom_arrays = profile.num_rom_arrays_per_instance ;
616+ sizes.num_ram_arrays = profile.num_ram_arrays_per_instance ;
617+ for (size_t idx : indices) {
618+ tasks.emplace_back ([handler, &constraints, idx](UltraCircuitBuilder& b) {
619+ handler (b, constraints.multi_scalar_mul_constraints [idx]);
620+ });
621+ task_sizes.push_back (sizes);
622+ task_profile_indices.push_back (profile_idx);
623+ }
624+ }
625+ }
526626 profile_and_collect (constraints.ec_add_constraints ,
527627 [](UltraCircuitBuilder& b, const EcAdd& c) { create_ec_add_constraint (b, c); });
528628
@@ -543,11 +643,14 @@ void build_constraints_parallel(UltraCircuitBuilder& builder,
543643 }
544644 }
545645
646+ info (" Phase 3: executing " , tasks.size (), " tasks with " , num_threads, " threads" );
647+
546648 // Phase 3: Execute ALL instances in parallel
547649 // execute_parallel will set up per-thread ROM/RAM cursors using the num_rom/ram_arrays in task_sizes
548650 if (!tasks.empty ()) {
549651 builder.execute_parallel (tasks, task_sizes, num_threads);
550652 }
653+ info (" Phase 3: done" );
551654
552655 // Phase 4: Block constraints and recursion constraints are processed sequentially.
553656 for (const auto & [constraint, opcode_indices] :
0 commit comments