Skip to content

Commit ce2ec0f

Browse files
committed
blake, poseidon, EC + deferred copy constraints + stronger semantic equivalence checks
1 parent 81b11c6 commit ce2ec0f

7 files changed

Lines changed: 820 additions & 79 deletions

File tree

barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.cpp

Lines changed: 143 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -278,44 +278,53 @@ ConstraintProfile profile_constraint_type(ConstraintType representative, Handler
278278
{
279279
ConstraintProfile profile;
280280

281-
// Create a throwaway builder with enough witness slots
281+
// Phase A: Run one instance on a throwaway builder to discover setup needs (constants, range lists, etc.)
282282
WitnessVector dummy_witness(num_witnesses, bb::fr(0));
283-
UltraCircuitBuilder throwaway{ dummy_witness, {}, false };
283+
UltraCircuitBuilder warmup_builder{ dummy_witness, {}, /*is_write_vk_mode=*/true };
284+
handler(warmup_builder, representative);
284285

285-
// First instance: triggers one-time setup
286-
handler(throwaway, representative);
287-
288-
// Second instance: measures steady-state cost
289-
auto before = throwaway.snapshot_block_sizes();
290-
size_t rom_before = throwaway.rom_ram_logic.rom_arrays.size();
291-
size_t ram_before = throwaway.rom_ram_logic.ram_arrays.size();
292-
handler(throwaway, representative);
293-
auto after = throwaway.snapshot_block_sizes();
294-
profile.block_sizes = UltraCircuitBuilder::delta(before, after);
295-
296-
// Extract ROM/RAM array counts per instance
297-
profile.num_rom_arrays_per_instance = throwaway.rom_ram_logic.rom_arrays.size() - rom_before;
298-
profile.num_ram_arrays_per_instance = throwaway.rom_ram_logic.ram_arrays.size() - ram_before;
299-
for (size_t i = rom_before; i < throwaway.rom_ram_logic.rom_arrays.size(); i++) {
300-
profile.rom_array_sizes.push_back(throwaway.rom_ram_logic.rom_arrays[i].state.size());
286+
// Extract setup data from the warmup builder
287+
for (const auto& [value, _] : warmup_builder.constant_variable_indices) {
288+
profile.constants.push_back(value);
301289
}
302-
for (size_t i = ram_before; i < throwaway.rom_ram_logic.ram_arrays.size(); i++) {
303-
profile.ram_array_sizes.push_back(throwaway.rom_ram_logic.ram_arrays[i].state.size());
290+
for (const auto& [target_range, _] : warmup_builder.range_lists) {
291+
profile.range_list_targets.push_back(target_range);
304292
}
305-
306-
// Extract constants
307-
for (const auto& [value, _] : throwaway.constant_variable_indices) {
308-
profile.constants.push_back(value);
293+
for (const auto& table : warmup_builder.get_lookup_tables()) {
294+
profile.table_ids.push_back(table.id);
309295
}
310296

311-
// Extract range list targets
312-
for (const auto& [target_range, _] : throwaway.range_lists) {
313-
profile.range_list_targets.push_back(target_range);
297+
// Phase B: Measure steady-state cost on a SEPARATE builder pre-populated with setup data.
298+
// This ensures no cross-instance gate fusion at the boundary, matching cursor-mode behavior
299+
// where each task starts with no prior gates in its block region.
300+
UltraCircuitBuilder measure_builder{ WitnessVector(dummy_witness), {}, /*is_write_vk_mode=*/true };
301+
for (const auto& value : profile.constants) {
302+
measure_builder.put_constant_variable(value);
303+
}
304+
for (const auto target_range : profile.range_list_targets) {
305+
if (measure_builder.range_lists.count(target_range) == 0) {
306+
measure_builder.range_lists.insert({ target_range, measure_builder.create_range_list(target_range) });
307+
}
308+
}
309+
for (const auto table_id : profile.table_ids) {
310+
measure_builder.get_table(table_id);
314311
}
315312

316-
// Extract lookup table IDs
317-
for (const auto& table : throwaway.get_lookup_tables()) {
318-
profile.table_ids.push_back(table.id);
313+
auto before = measure_builder.snapshot_block_sizes();
314+
size_t rom_before = measure_builder.rom_ram_logic.rom_arrays.size();
315+
size_t ram_before = measure_builder.rom_ram_logic.ram_arrays.size();
316+
handler(measure_builder, representative);
317+
auto after = measure_builder.snapshot_block_sizes();
318+
profile.block_sizes = UltraCircuitBuilder::delta(before, after);
319+
320+
// Extract ROM/RAM array counts per instance
321+
profile.num_rom_arrays_per_instance = measure_builder.rom_ram_logic.rom_arrays.size() - rom_before;
322+
profile.num_ram_arrays_per_instance = measure_builder.rom_ram_logic.ram_arrays.size() - ram_before;
323+
for (size_t i = rom_before; i < measure_builder.rom_ram_logic.rom_arrays.size(); i++) {
324+
profile.rom_array_sizes.push_back(measure_builder.rom_ram_logic.rom_arrays[i].state.size());
325+
}
326+
for (size_t i = ram_before; i < measure_builder.rom_ram_logic.ram_arrays.size(); i++) {
327+
profile.ram_array_sizes.push_back(measure_builder.rom_ram_logic.ram_arrays[i].state.size());
319328
}
320329

321330
return profile;
@@ -510,19 +519,110 @@ void build_constraints_parallel(UltraCircuitBuilder& builder,
510519
profile_and_collect(constraints.ecdsa_r1_constraints, [](UltraCircuitBuilder& b, const EcdsaConstraint& c) {
511520
create_ecdsa_verify_constraints<stdlib::secp256r1<UltraCircuitBuilder>>(b, c);
512521
});
513-
profile_and_collect(constraints.blake2s_constraints,
514-
[](UltraCircuitBuilder& b, const Blake2sConstraint& c) { create_blake2s_constraints(b, c); });
515-
profile_and_collect(constraints.blake3_constraints,
516-
[](UltraCircuitBuilder& b, const Blake3Constraint& c) { create_blake3_constraints(b, c); });
522+
// Blake2s constraints must be grouped by inputs.size() since different input lengths produce different gate counts.
523+
{
524+
std::map<size_t, std::vector<size_t>> blake2s_groups;
525+
for (size_t i = 0; i < constraints.blake2s_constraints.size(); i++) {
526+
blake2s_groups[constraints.blake2s_constraints[i].inputs.size()].push_back(i);
527+
}
528+
auto handler = [](UltraCircuitBuilder& b, const Blake2sConstraint& c) { create_blake2s_constraints(b, c); };
529+
for (auto& [sz, indices] : blake2s_groups) {
530+
auto& representative = constraints.blake2s_constraints[indices[0]];
531+
auto profile = profile_constraint_type(representative, handler, num_witnesses);
532+
size_t profile_idx = profiles.size();
533+
profiles.push_back(profile);
534+
auto sizes = profile.block_sizes;
535+
sizes.num_rom_arrays = profile.num_rom_arrays_per_instance;
536+
sizes.num_ram_arrays = profile.num_ram_arrays_per_instance;
537+
for (size_t idx : indices) {
538+
tasks.emplace_back([handler, &constraints, idx](UltraCircuitBuilder& b) {
539+
handler(b, constraints.blake2s_constraints[idx]);
540+
});
541+
task_sizes.push_back(sizes);
542+
task_profile_indices.push_back(profile_idx);
543+
}
544+
}
545+
}
546+
// Blake3 constraints must be grouped by inputs.size() since different input lengths produce different gate counts.
547+
{
548+
std::map<size_t, std::vector<size_t>> blake3_groups;
549+
for (size_t i = 0; i < constraints.blake3_constraints.size(); i++) {
550+
blake3_groups[constraints.blake3_constraints[i].inputs.size()].push_back(i);
551+
}
552+
auto handler = [](UltraCircuitBuilder& b, const Blake3Constraint& c) { create_blake3_constraints(b, c); };
553+
for (auto& [sz, indices] : blake3_groups) {
554+
auto& representative = constraints.blake3_constraints[indices[0]];
555+
auto profile = profile_constraint_type(representative, handler, num_witnesses);
556+
size_t profile_idx = profiles.size();
557+
profiles.push_back(profile);
558+
auto sizes = profile.block_sizes;
559+
sizes.num_rom_arrays = profile.num_rom_arrays_per_instance;
560+
sizes.num_ram_arrays = profile.num_ram_arrays_per_instance;
561+
for (size_t idx : indices) {
562+
tasks.emplace_back([handler, &constraints, idx](UltraCircuitBuilder& b) {
563+
handler(b, constraints.blake3_constraints[idx]);
564+
});
565+
task_sizes.push_back(sizes);
566+
task_profile_indices.push_back(profile_idx);
567+
}
568+
}
569+
}
517570
profile_and_collect(constraints.keccak_permutations, [](UltraCircuitBuilder& b, const Keccakf1600& c) {
518571
create_keccak_permutations_constraints(b, c);
519572
});
520-
profile_and_collect(constraints.poseidon2_constraints, [](UltraCircuitBuilder& b, const Poseidon2Constraint& c) {
521-
create_poseidon2_permutations_constraints(b, c);
522-
});
523-
profile_and_collect(constraints.multi_scalar_mul_constraints, [](UltraCircuitBuilder& b, const MultiScalarMul& c) {
524-
create_multi_scalar_mul_constraint(b, c);
525-
});
573+
// Poseidon2 constraints must be grouped by state.size() since different widths produce different gate counts.
574+
{
575+
std::map<size_t, std::vector<size_t>> pos2_groups;
576+
for (size_t i = 0; i < constraints.poseidon2_constraints.size(); i++) {
577+
pos2_groups[constraints.poseidon2_constraints[i].state.size()].push_back(i);
578+
}
579+
auto handler = [](UltraCircuitBuilder& b, const Poseidon2Constraint& c) {
580+
create_poseidon2_permutations_constraints(b, c);
581+
};
582+
for (auto& [sz, indices] : pos2_groups) {
583+
auto& representative = constraints.poseidon2_constraints[indices[0]];
584+
auto profile = profile_constraint_type(representative, handler, num_witnesses);
585+
size_t profile_idx = profiles.size();
586+
profiles.push_back(profile);
587+
auto sizes = profile.block_sizes;
588+
sizes.num_rom_arrays = profile.num_rom_arrays_per_instance;
589+
sizes.num_ram_arrays = profile.num_ram_arrays_per_instance;
590+
for (size_t idx : indices) {
591+
tasks.emplace_back([handler, &constraints, idx](UltraCircuitBuilder& b) {
592+
handler(b, constraints.poseidon2_constraints[idx]);
593+
});
594+
task_sizes.push_back(sizes);
595+
task_profile_indices.push_back(profile_idx);
596+
}
597+
}
598+
}
599+
// MultiScalarMul constraints must be grouped by points.size() since different point counts
600+
// produce different gate counts and different numbers of ROM arrays.
601+
{
602+
std::map<size_t, std::vector<size_t>> msm_groups;
603+
for (size_t i = 0; i < constraints.multi_scalar_mul_constraints.size(); i++) {
604+
msm_groups[constraints.multi_scalar_mul_constraints[i].points.size()].push_back(i);
605+
}
606+
auto handler = [](UltraCircuitBuilder& b, const MultiScalarMul& c) {
607+
create_multi_scalar_mul_constraint(b, c);
608+
};
609+
for (auto& [sz, indices] : msm_groups) {
610+
auto& representative = constraints.multi_scalar_mul_constraints[indices[0]];
611+
auto profile = profile_constraint_type(representative, handler, num_witnesses);
612+
size_t profile_idx = profiles.size();
613+
profiles.push_back(profile);
614+
auto sizes = profile.block_sizes;
615+
sizes.num_rom_arrays = profile.num_rom_arrays_per_instance;
616+
sizes.num_ram_arrays = profile.num_ram_arrays_per_instance;
617+
for (size_t idx : indices) {
618+
tasks.emplace_back([handler, &constraints, idx](UltraCircuitBuilder& b) {
619+
handler(b, constraints.multi_scalar_mul_constraints[idx]);
620+
});
621+
task_sizes.push_back(sizes);
622+
task_profile_indices.push_back(profile_idx);
623+
}
624+
}
625+
}
526626
profile_and_collect(constraints.ec_add_constraints,
527627
[](UltraCircuitBuilder& b, const EcAdd& c) { create_ec_add_constraint(b, c); });
528628

@@ -543,11 +643,14 @@ void build_constraints_parallel(UltraCircuitBuilder& builder,
543643
}
544644
}
545645

646+
info(" Phase 3: executing ", tasks.size(), " tasks with ", num_threads, " threads");
647+
546648
// Phase 3: Execute ALL instances in parallel
547649
// execute_parallel will set up per-thread ROM/RAM cursors using the num_rom/ram_arrays in task_sizes
548650
if (!tasks.empty()) {
549651
builder.execute_parallel(tasks, task_sizes, num_threads);
550652
}
653+
info(" Phase 3: done");
551654

552655
// Phase 4: Block constraints and recursion constraints are processed sequentially.
553656
for (const auto& [constraint, opcode_indices] :

0 commit comments

Comments
 (0)