@@ -252,42 +252,88 @@ template void build_constraints<UltraCircuitBuilder>(UltraCircuitBuilder&, AcirF
252252template void build_constraints<MegaCircuitBuilder>(MegaCircuitBuilder&, AcirFormat&, const ProgramMetadata&);
253253
254254/* *
255- * @brief Helper: run the first two instances of a constraint type as warmup, measure the steady-state
256- * per-instance size from the second, and collect remaining instances as tasks for parallel execution.
257- * @details The first instance triggers one-time setup (range list creation, lookup table creation, etc.)
258- * and is inflated. The second instance represents steady-state cost. If only one instance exists,
259- * it's processed during warmup and nothing is collected for parallel execution.
255+ * @brief Profile data for a constraint type, extracted from a throwaway builder.
256+ * @details Eventually this will be a compile-time table lookup. For now, it's computed
257+ * by running constraints on a throwaway builder and extracting the resulting state.
258+ */
259+ struct ConstraintProfile {
260+ UltraCircuitBuilder::TaskBlockSizes block_sizes;
261+ std::vector<bb::fr> constants; // constant values to pre-register
262+ std::vector<uint64_t > range_list_targets; // range list target ranges to pre-create
263+ std::vector<plookup::BasicTableId> table_ids; // lookup tables to pre-create
264+ };
265+
266+ /* *
267+ * @brief Profile a constraint type by running it on a throwaway builder and extracting cache state.
268+ * @details Runs two instances: the first triggers one-time setup, the second measures steady-state cost.
269+ * Extracts all constants, range list targets, and lookup table IDs that the constraint type needs.
270+ * This simulates the eventual table lookup.
260271 */
261272template <typename ConstraintType, typename Handler>
262- void warmup_and_collect (UltraCircuitBuilder& builder,
263- std::vector<ConstraintType>& items,
264- Handler&& handler,
265- std::vector<std::function<void (UltraCircuitBuilder&)>>& tasks,
266- std::vector<UltraCircuitBuilder::TaskBlockSizes>& task_sizes)
273+ ConstraintProfile profile_constraint_type (ConstraintType representative, Handler&& handler, size_t num_witnesses)
267274{
268- if (items.empty ()) {
269- return ;
275+ ConstraintProfile profile;
276+
277+ // Create a throwaway builder with enough witness slots
278+ WitnessVector dummy_witness (num_witnesses, bb::fr (0 ));
279+ UltraCircuitBuilder throwaway{ dummy_witness, {}, false };
280+
281+ // First instance: triggers one-time setup
282+ handler (throwaway, representative);
283+
284+ // Second instance: measures steady-state cost
285+ auto before = throwaway.snapshot_block_sizes ();
286+ handler (throwaway, representative);
287+ auto after = throwaway.snapshot_block_sizes ();
288+ profile.block_sizes = UltraCircuitBuilder::delta (before, after);
289+
290+ // Extract constants
291+ for (const auto & [value, _] : throwaway.constant_variable_indices ) {
292+ profile.constants .push_back (value);
270293 }
271294
272- using TaskBlockSizes = UltraCircuitBuilder::TaskBlockSizes;
295+ // Extract range list targets
296+ for (const auto & [target_range, _] : throwaway.range_lists ) {
297+ profile.range_list_targets .push_back (target_range);
298+ }
299+
300+ // Extract lookup table IDs
301+ for (const auto & table : throwaway.get_lookup_tables ()) {
302+ profile.table_ids .push_back (table.id );
303+ }
273304
274- // First instance: warmup to populate caches (size is inflated by one-time setup)
275- handler (builder, items[ 0 ]);
305+ return profile;
306+ }
276307
277- if (items.size () == 1 ) {
278- return ;
308+ /* *
309+ * @brief Prepare a builder's caches from constraint profiles WITHOUT running any constraints.
310+ * @details Populates the builder's constant cache, range lists, and lookup tables using data
311+ * extracted from profiles. After this, all parallel constraint execution will find everything
312+ * cached — no cache misses, no one-time setup costs.
313+ */
314+ void prepare_builder_from_profiles (UltraCircuitBuilder& builder, const std::vector<ConstraintProfile>& profiles)
315+ {
316+ // Register all constants from all profiles
317+ for (const auto & profile : profiles) {
318+ for (const auto & value : profile.constants ) {
319+ builder.put_constant_variable (value);
320+ }
279321 }
280322
281- // Second instance: measure steady-state per-instance size
282- auto before = builder.snapshot_block_sizes ();
283- handler (builder, items[1 ]);
284- auto after = builder.snapshot_block_sizes ();
285- TaskBlockSizes per_instance = UltraCircuitBuilder::delta (before, after);
323+ // Create all needed range lists
324+ for (const auto & profile : profiles) {
325+ for (const auto target_range : profile.range_list_targets ) {
326+ if (builder.range_lists .count (target_range) == 0 ) {
327+ builder.range_lists .insert ({ target_range, builder.create_range_list (target_range) });
328+ }
329+ }
330+ }
286331
287- // Collect remaining instances (from index 2 onward) as tasks
288- for (size_t i = 2 ; i < items.size (); i++) {
289- tasks.emplace_back ([&handler, &items, i](UltraCircuitBuilder& b) { handler (b, items[i]); });
290- task_sizes.push_back (per_instance);
332+ // Create all needed lookup tables
333+ for (const auto & profile : profiles) {
334+ for (const auto table_id : profile.table_ids ) {
335+ builder.get_table (table_id);
336+ }
291337 }
292338}
293339
@@ -297,125 +343,73 @@ void build_constraints_parallel(UltraCircuitBuilder& builder,
297343 size_t num_threads)
298344{
299345 using TaskBlockSizes = UltraCircuitBuilder::TaskBlockSizes;
346+ size_t num_witnesses = constraints.max_witness_index + 1 ;
347+
348+ // Phase 1: Profile each constraint type on throwaway builders (simulates table lookup).
349+ // Collect ALL instances as parallel tasks.
350+ std::vector<ConstraintProfile> profiles;
300351 std::vector<std::function<void (UltraCircuitBuilder&)>> tasks;
301352 std::vector<TaskBlockSizes> task_sizes;
302353
303- // Phase 1: Warmup — run one instance of each constraint type sequentially, in the same order
304- // as build_constraints. This populates caches (constants, range lists, lookup tables) so that
305- // subsequent parallel instances find everything cached. Also measure per-instance sizes.
306- warmup_and_collect (
307- builder,
308- constraints.quad_constraints ,
309- [](UltraCircuitBuilder& b, QuadConstraint& c) { create_quad_constraint (b, c); },
310- tasks,
311- task_sizes);
312-
313- warmup_and_collect (
314- builder,
315- constraints.big_quad_constraints ,
316- [](UltraCircuitBuilder& b, BigQuadConstraint& c) { create_big_quad_constraint (b, c); },
317- tasks,
318- task_sizes);
319-
320- warmup_and_collect (
321- builder,
322- constraints.logic_constraints ,
323- [](UltraCircuitBuilder& b, const LogicConstraint& c) {
324- create_logic_gate (b, c.a , c.b , c.result , c.num_bits , c.is_xor_gate );
325- },
326- tasks,
327- task_sizes);
328-
329- warmup_and_collect (
330- builder,
331- constraints.range_constraints ,
332- [](UltraCircuitBuilder& b, const RangeConstraint& c) {
333- b.create_dyadic_range_constraint (c.witness , c.num_bits , " parallel range constraint" );
334- },
335- tasks,
336- task_sizes);
337-
338- warmup_and_collect (
339- builder,
340- constraints.aes128_constraints ,
341- [](UltraCircuitBuilder& b, const AES128Constraint& c) { create_aes128_constraints (b, c); },
342- tasks,
343- task_sizes);
344-
345- warmup_and_collect (
346- builder,
347- constraints.sha256_compression ,
348- [](UltraCircuitBuilder& b, const Sha256Compression& c) { create_sha256_compression_constraints (b, c); },
349- tasks,
350- task_sizes);
351-
352- warmup_and_collect (
353- builder,
354- constraints.ecdsa_k1_constraints ,
355- [](UltraCircuitBuilder& b, const EcdsaConstraint& c) {
356- create_ecdsa_verify_constraints<stdlib::secp256k1<UltraCircuitBuilder>>(b, c);
357- },
358- tasks,
359- task_sizes);
360-
361- warmup_and_collect (
362- builder,
363- constraints.ecdsa_r1_constraints ,
364- [](UltraCircuitBuilder& b, const EcdsaConstraint& c) {
365- create_ecdsa_verify_constraints<stdlib::secp256r1<UltraCircuitBuilder>>(b, c);
366- },
367- tasks,
368- task_sizes);
369-
370- warmup_and_collect (
371- builder,
372- constraints.blake2s_constraints ,
373- [](UltraCircuitBuilder& b, const Blake2sConstraint& c) { create_blake2s_constraints (b, c); },
374- tasks,
375- task_sizes);
376-
377- warmup_and_collect (
378- builder,
379- constraints.blake3_constraints ,
380- [](UltraCircuitBuilder& b, const Blake3Constraint& c) { create_blake3_constraints (b, c); },
381- tasks,
382- task_sizes);
383-
384- warmup_and_collect (
385- builder,
386- constraints.keccak_permutations ,
387- [](UltraCircuitBuilder& b, const Keccakf1600& c) { create_keccak_permutations_constraints (b, c); },
388- tasks,
389- task_sizes);
390-
391- warmup_and_collect (
392- builder,
393- constraints.poseidon2_constraints ,
394- [](UltraCircuitBuilder& b, const Poseidon2Constraint& c) { create_poseidon2_permutations_constraints (b, c); },
395- tasks,
396- task_sizes);
397-
398- warmup_and_collect (
399- builder,
400- constraints.multi_scalar_mul_constraints ,
401- [](UltraCircuitBuilder& b, const MultiScalarMul& c) { create_multi_scalar_mul_constraint (b, c); },
402- tasks,
403- task_sizes);
404-
405- warmup_and_collect (
406- builder,
407- constraints.ec_add_constraints ,
408- [](UltraCircuitBuilder& b, const EcAdd& c) { create_ec_add_constraint (b, c); },
409- tasks,
410- task_sizes);
411-
412- // Phase 2: Execute all remaining instances in parallel
354+ // Helper: profile a constraint type and register all its instances as tasks
355+ auto profile_and_collect = [&](auto & items, auto handler) {
356+ if (items.empty ()) {
357+ return ;
358+ }
359+ auto profile = profile_constraint_type (items[0 ], handler, num_witnesses);
360+ profiles.push_back (profile);
361+ for (size_t i = 0 ; i < items.size (); i++) {
362+ tasks.emplace_back ([handler, &items, i](UltraCircuitBuilder& b) { handler (b, items[i]); });
363+ task_sizes.push_back (profile.block_sizes );
364+ }
365+ };
366+
367+ profile_and_collect (constraints.quad_constraints ,
368+ [](UltraCircuitBuilder& b, QuadConstraint& c) { create_quad_constraint (b, c); });
369+ profile_and_collect (constraints.big_quad_constraints ,
370+ [](UltraCircuitBuilder& b, BigQuadConstraint& c) { create_big_quad_constraint (b, c); });
371+ profile_and_collect (constraints.logic_constraints , [](UltraCircuitBuilder& b, const LogicConstraint& c) {
372+ create_logic_gate (b, c.a , c.b , c.result , c.num_bits , c.is_xor_gate );
373+ });
374+ profile_and_collect (constraints.range_constraints , [](UltraCircuitBuilder& b, const RangeConstraint& c) {
375+ b.create_dyadic_range_constraint (c.witness , c.num_bits , " parallel range constraint" );
376+ });
377+ profile_and_collect (constraints.aes128_constraints ,
378+ [](UltraCircuitBuilder& b, const AES128Constraint& c) { create_aes128_constraints (b, c); });
379+ profile_and_collect (constraints.sha256_compression , [](UltraCircuitBuilder& b, const Sha256Compression& c) {
380+ create_sha256_compression_constraints (b, c);
381+ });
382+ profile_and_collect (constraints.ecdsa_k1_constraints , [](UltraCircuitBuilder& b, const EcdsaConstraint& c) {
383+ create_ecdsa_verify_constraints<stdlib::secp256k1<UltraCircuitBuilder>>(b, c);
384+ });
385+ profile_and_collect (constraints.ecdsa_r1_constraints , [](UltraCircuitBuilder& b, const EcdsaConstraint& c) {
386+ create_ecdsa_verify_constraints<stdlib::secp256r1<UltraCircuitBuilder>>(b, c);
387+ });
388+ profile_and_collect (constraints.blake2s_constraints ,
389+ [](UltraCircuitBuilder& b, const Blake2sConstraint& c) { create_blake2s_constraints (b, c); });
390+ profile_and_collect (constraints.blake3_constraints ,
391+ [](UltraCircuitBuilder& b, const Blake3Constraint& c) { create_blake3_constraints (b, c); });
392+ profile_and_collect (constraints.keccak_permutations , [](UltraCircuitBuilder& b, const Keccakf1600& c) {
393+ create_keccak_permutations_constraints (b, c);
394+ });
395+ profile_and_collect (constraints.poseidon2_constraints , [](UltraCircuitBuilder& b, const Poseidon2Constraint& c) {
396+ create_poseidon2_permutations_constraints (b, c);
397+ });
398+ profile_and_collect (constraints.multi_scalar_mul_constraints , [](UltraCircuitBuilder& b, const MultiScalarMul& c) {
399+ create_multi_scalar_mul_constraint (b, c);
400+ });
401+ profile_and_collect (constraints.ec_add_constraints ,
402+ [](UltraCircuitBuilder& b, const EcAdd& c) { create_ec_add_constraint (b, c); });
403+
404+ // Phase 2: Prepare the builder's caches from profiles (no constraint execution).
405+ prepare_builder_from_profiles (builder, profiles);
406+
407+ // Phase 3: Execute ALL instances in parallel
413408 if (!tasks.empty ()) {
414409 builder.execute_parallel (tasks, task_sizes, num_threads);
415410 }
416411
417- // Phase 3: Block constraints and recursion constraints are processed sequentially — they have
418- // complex interdependencies and are typically few in number.
412+ // Phase 4: Block constraints and recursion constraints are processed sequentially.
419413 for (const auto & [constraint, opcode_indices] :
420414 zip_view (constraints.block_constraints , constraints.original_opcode_indices .block_constraints )) {
421415 create_block_constraints (builder, constraint);
0 commit comments