diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml index 00ca38c9..451fabee 100644 --- a/.github/workflows/build-c-libraries.yml +++ b/.github/workflows/build-c-libraries.yml @@ -82,6 +82,18 @@ jobs: fetch-depth: 1 path: mpir_gc_x64 + - name: Ensure cmake available (macOS) + if: matrix.os.matrix == 'macos' + shell: bash + run: | + brew ls --versions cmake >/dev/null 2>&1 || brew install cmake + CMAKE_BIN="$(brew --prefix cmake)/bin" + if [ -d "$CMAKE_BIN" ]; then + echo "$CMAKE_BIN" >> "$GITHUB_PATH" + export PATH="$CMAKE_BIN:$PATH" + fi + cmake --version + - name: Build working-directory: src env: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cd6bec02..798241d7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -102,6 +102,18 @@ jobs: with: python-version: ${{ matrix.python.major-dot-minor }} + - name: Ensure cmake available (macOS) + if: matrix.os.matrix == 'macos' + shell: bash + run: | + brew ls --versions cmake >/dev/null 2>&1 || brew install cmake + CMAKE_BIN="$(brew --prefix cmake)/bin" + if [ -d "$CMAKE_BIN" ]; then + echo "$CMAKE_BIN" >> "$GITHUB_PATH" + export PATH="$CMAKE_BIN:$PATH" + fi + cmake --version + - name: Install pipx run: | pip install pipx diff --git a/BBR_BLUEBOX_COMPACTION_OVERVIEW.md b/BBR_BLUEBOX_COMPACTION_OVERVIEW.md new file mode 100644 index 00000000..ba15ac86 --- /dev/null +++ b/BBR_BLUEBOX_COMPACTION_OVERVIEW.md @@ -0,0 +1,308 @@ +# BBR / Chia Bluebox VDF Proof Compaction — Overview + Implemented Performance Tricks + +This document summarizes how Chia “bluebox” compaction jobs are computed in this repo, and the performance tweaks we implemented in `bbr_chiavdf/` (a fork/copy of `chiavdf/`). + +It is written to be understandable even if you’re not already fluent with Chia’s VDF / classgroup implementation details. 
+ +## 1) What “bluebox compaction” computes + +Chia blocks include a VDF output (“VDFInfo”) for some VDF “slot” (end-of-slot, signage point, infusion point, etc.). A full node can accept a **compact proof of time** (“VDFProof”) for a given VDFInfo: + +- Input element `x` (for compaction jobs this is always the **default/canonical classgroup element**, i.e. the fixed generator form Chia uses as the VDF starting point — not the group identity, whose repeated squaring would be trivial) +- Discriminant `D` derived from the VDF challenge +- Number of iterations `T` +- Output element `y` (already known from the block’s VDFInfo) + +The bluebox worker’s job is to compute the **compact Wesolowski witness** `π` (witness_type = 0) such that the proof verifies for `(D, x, y, T)`. + +### Inputs (per proof job) + +All values are byte strings unless noted otherwise. + +- `challenge`: 32 bytes (`VDFInfo.challenge`) +- `T`: `u64` (`VDFInfo.number_of_iterations`) +- `y_ref`: 100 bytes serialized classgroup element (`VDFInfo.output`) +- `size_bits`: discriminant size in bits (typically 1024; Chia consensus constant) +- `x0`: canonical input element for compaction: + - `x0_bytes = ClassgroupElement.get_default_element().data` (100 bytes) + +### Outputs (per proof job) + +- `y`: serialized output element (should equal `y_ref`) +- `proof`: serialized witness element `π` (same size as `y`) +- In our C ABI wrappers we return `y || proof` (concatenation, typically 200 bytes for 1024-bit discriminants). + +## 2) Underlying primitives (high-level) + +Chia’s VDF uses the class group of binary quadratic forms for a negative discriminant `D`: + +- `D` is derived deterministically from `(challenge, size_bits)` via `CreateDiscriminant(...)`. +- Elements are represented as reduced forms `form { a, b, c }` (each is a GMP big integer). 
+- The VDF evaluation is the deterministic repeated squaring chain: + - `f(0) = x0` + - `f(t+1) = square(f(t))` (with reduction) + - `y = f(T) = x0^(2^T)` in the class group + +The compact proof is a Wesolowski proof, which (in this implementation) uses a per-proof prime `B` derived from the input and output: + +- `B = GetB(D, x0_form, y_ref_form)` where `GetB` hashes serialized forms then runs `HashPrime(...)`. +- Because `B` depends on `y_ref`, if `y_ref` is known up front then **`B` is known before squaring starts**. + +## 3) Baseline chiavdf “one-weso” compaction (two-phase) + +The upstream chiavdf compact witness path (“one-weso”) is: + +1. **Squaring phase** + - Run the VDF evaluation (sequential squaring) from `x0` to iteration `T`. + - Store many intermediate forms (“checkpoints”) in an array at a fixed cadence. + +2. **Proof phase** + - After squaring is finished, scan those stored checkpoints. + - Multiply them into “buckets” `ys[j][b]` using a mapping (`GetBlock`) that depends on `B`. + - Fold the bucket structure into a final `proof_form`. + +### Proof parameters `k, l, kl` + +chiavdf computes parameters: + +- `(k, l) = ApproximateParameters(T)` +- `kl = k * l` +- Number of checkpoint indices: + - `limit = ceil(T / kl)` (the number of checkpoint positions that may be used) + +These parameters control how many checkpoints are used and how bucket folding is structured. + +### Costs (baseline) + +- Memory: stores `O(ceil(T/kl))` checkpoint forms (each form holds several GMP big integers). +- Time: wall-clock is essentially `t_total = t_square + t_proof` because proof work happens after squaring. + +## 4) Tweak / Trick 1 — “Streaming one-weso” using known output (`y_ref`) + +### Key idea + +For bluebox compaction, `y_ref` is already known from the block. Because `B` depends on `y_ref`, we can compute `B` before starting squaring. 
+ +That lets us avoid storing checkpoint forms and instead update the proof buckets **as soon as each checkpoint is reached**, using the current `f(t)` value. + +### Algorithm (single job, streaming buckets) + +Inputs: `(challenge, size_bits, x0_bytes, y_ref_bytes, T)` + +1. Compute `D = CreateDiscriminant(challenge, size_bits)` and `L = root(-D, 4)` (chiavdf convention). +2. Deserialize: + - `x0_form = DeserializeForm(D, x0_bytes)` + - `y_ref_form = DeserializeForm(D, y_ref_bytes)` +3. Compute: + - `B = GetB(D, x0_form, y_ref_form)` + - `(k, l) = ApproximateParameters(T)` (fallback `k=10,l=1` for small `T`) + - `kl = k*l` + - `limit = ceil(T/kl)` +4. Allocate buckets: + - `ys[j][b]` for `j ∈ [0, l)` and `b ∈ [0, 2^k)` + - Initialize all buckets to the identity form. +5. Run the VDF squaring chain up to `T`, but: + - At each checkpoint time `t = i*kl`, compute `checkpoint = f(t)` and call `process_checkpoint(i, checkpoint)`: + - For each `j ∈ [0, l)`: + - `p = i*l + j` + - If `T >= k*(p+1)`, compute `b = GetBlock(p, k, T, B)` + - Multiply `ys[j][b] *= checkpoint` (via `nucomp_form`). +6. At the end, compute `y = f(T)` and check `y == y_ref_form` (debug/safety guard). +7. Fold buckets to compute the final proof form (same folding logic as chiavdf). +8. Serialize `y` and `proof` and return `y || proof`. + +### What changed vs baseline + +- We no longer store an array of checkpoint forms. +- Bucket multiplication occurs “online” during squaring. +- Folding/finalization stays the same as chiavdf. + +### Costs / tradeoffs + +- Memory becomes `O(l * 2^k)` forms (the bucket table) instead of `O(ceil(T/kl))` checkpoint forms. +- Runtime can sometimes overlap bucket updates with squaring, but in practice the speedup depends on which part dominates (squaring vs `nucomp_form` multiplications). 
+ +### Where this lives in `bbr_chiavdf/` + +- C ABI entrypoints: + - `chiavdf_prove_one_weso_fast_streaming(...)` + - `chiavdf_prove_one_weso_fast_streaming_with_progress(...)` +- Implementation: + - `bbr_chiavdf/src/c_bindings/fast_wrapper.cpp` + - `StreamingOneWesolowskiCallback` and the bucket helper (`StreamingWesolowskiBuckets`). + +## 5) GetBlock optimization (precompute `GetBlock(p)` table per job) + +In streaming (and in baseline), for each checkpoint update we need: + +- `b = GetBlock(p, k, T, B)` + +Naively this uses per-`p` modular exponentiation and division, which is expensive with GMP big integers. + +### Optimization idea + +For fixed `(T, k, B)`, define: + +- `r_p = 2^{T - k*(p+1)} mod B` +- `b_p = floor((r_p * 2^k) / B)` (integer division) + +Then: + +- `r_{p+1} = r_p * inv(2^k) mod B` where `inv(2^k)` is the modular inverse of `2^k mod B` + +So we can compute all `b_p` iteratively in `O(#p)` time with one modular inverse, instead of `O(#p)` modular exponentiations. + +### Tradeoff + +We store `precomputed_blocks[p]` for all `p` used by the proof: + +- Memory: `O(limit * l)` `u32` values per job. + - For typical compaction-scale `T` this is often a few MB per job. + +### Where this lives + +- `bbr_chiavdf/src/c_bindings/fast_wrapper.cpp`: + - `build_precomputed_getblocks(...)` + - Used by: + - `chiavdf_prove_one_weso_fast_streaming_getblock_opt(...)` + - `chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(...)` + +## 6) Trick 2 — discriminant reuse (“multi-target VDF engine”) + +### Key observation + +For a fixed group key `(challenge, size_bits, x0_bytes)`, the discriminant `D` and the entire squaring trajectory `f(t)` are identical for all jobs: + +- Only `T_j` and `y_ref_j` differ across jobs. 
+ +Therefore, if you have `N` jobs sharing a group key: + +- Without reuse: total squaring work is `Σ T_j` +- With reuse: total squaring work is exactly `T_max = max(T_j)` + +### Grouping key + +Jobs can be grouped if and only if: + +- Same `challenge` +- Same `size_bits` +- Same `x0_bytes` + +For bluebox compaction, `x0_bytes` is always the default element, so grouping is mostly “same challenge”. + +### Algorithm (batch) + +Inputs (shared): + +- `challenge`, `x0_bytes`, `size_bits` + +Inputs (per job `j`): + +- `T_j`, `y_ref_j` + +Per job setup (done before squaring starts): + +1. Deserialize `y_ref_form_j` +2. Compute `B_j = GetB(D, x0_form, y_ref_form_j)` +3. Compute `(k_j, l_j)`, `kl_j`, `limit_j` +4. Allocate `ys_j` buckets (Trick 1) +5. Precompute `GetBlock` table for that job (GetBlock opt) + +Shared squaring run: + +- Run `repeated_square(T_max, ...)` once to generate `f(t)` for all times up to `T_max`. +- Maintain per job: + - `next_checkpoint_t_j` initialized to `kl_j` (we process `i=0` immediately at `t=0`) + - completion time `T_j` +- At each “event time” `t`: + 1. For every job where `t == next_checkpoint_t_j`: + - `i = t / kl_j` + - `ys_j` bucket update with checkpoint form `f(t)` (Trick 1) + - `next_checkpoint_t_j += kl_j` + 2. For every job where `t == T_j`: + - Debug check: `f(T_j) == y_ref_form_j` + - If mismatch: abort (signals backend grouping/data bug). + - Finalize proof for that job (fold buckets → proof form) and serialize result. + - Free job state (buckets, GetBlock table) to reduce peak RAM. + +### Concurrency / offloading finalization + +- The shared squaring chain itself is sequential by definition. +- Bucket updates are triggered by exact `f(t)` values; in our implementation they are done on the squaring callback thread to avoid copying forms or storing a large checkpoint history. 
+- Finalization (folding + serialization) is **per job** and can be offloaded: + - Once a job reaches `T_j` and passes the `f(T_j)==y_ref_j` check, its proof no longer depends on future squaring. + - We hand each completed job's finalization to a small shared worker pool (`GlobalBatchFinalizerPool` in `fast_wrapper.cpp`) instead of spawning one thread per job, so the squaring run can continue toward larger `T`. + +### Where this lives + +- New C ABI: + - `bbr_chiavdf/src/c_bindings/fast_wrapper.h`: + - `ChiavdfBatchJob` + - `chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(...)` + - `chiavdf_free_byte_array_batch(...)` +- Implementation: + - `bbr_chiavdf/src/c_bindings/fast_wrapper.cpp`: + - `BatchOneWesolowskiCallback` + - `BatchJobState` + - Uses `StreamingWesolowskiBuckets` per job + +### Error policy (mismatch) + +We keep a strict mismatch check specifically for debugging backend grouping / job data issues: + +- If the computed checkpoint `f(T_j)` differs from `y_ref_form_j`, the batch function returns `NULL` (fatal error). + +This is expected to be “should never happen” in normal operation, but is useful to detect wrong grouping inputs early. + +## 7) Rough resource model (what consumes time and RAM) + +### Time + +Three main contributors: + +1. **Squaring chain** (`repeated_square(...)`): inherently sequential per group. +2. **Bucket updates**: `nucomp_form` multiplications at checkpoint times; scales with number of jobs and number of checkpoints. +3. **Finalization**: folding buckets into a proof; per job. + +Trick 2 reduces (1) across jobs by reusing squaring work. + +### Memory (per job, within a group) + +Dominant memory terms: + +- Buckets: `l * 2^k` forms (each form holds multiple GMP big ints) — often several MB per job. +- GetBlock precompute: `limit * l` `u32` values — often a few MB per job. + +Peak memory per group is roughly linear in the number of jobs active at the same time (and drops as jobs complete and are freed). 
+ +## 8) Things to look at next (possible improvement areas) + +This section is intentionally a “menu” for further investigation. + +1. **Hotspots inside classgroup arithmetic** + - If perf shows most time in `nucomp_form` / GMP, then: + - reduce allocations (GMP mpz churn) with pooling or reuse + - explore alternative big-int backends / tuned GMP build / CPU-specific flags + - reduce constant factors in `nucomp_form` (algorithmic / assembly improvements) + +2. **Reduce per-iteration callback overhead** + - Today `OnIteration` is called for every iteration, even though we only act on sparse “event times”. + - If this overhead becomes visible at huge `T`, consider: + - extending the core loop to support “next event” iteration skipping (intrusive change) + - or internal batching in the callback path + +3. **Finalization optimization** + - Each job finalization constructs a reducer and folds buckets. + - Potential wins: + - reuse reducers per thread + - reduce intermediate `form` temporaries and copies + +4. **Group sizing / scheduling** + - For Trick 2, there’s a throughput vs RAM tradeoff. + - Consider dynamic group size based on memory budget and `T` distribution. + +5. **Optional: parallelize bucket updates (hard)** + - Bucket updates need the checkpoint form `f(t)` at exact times. + - Parallelizing this without copying/storing forms requires careful design (e.g. immutable snapshots, reference counting, or storing a checkpoint history). + - This is the next “big step” if per-job proof work becomes the bottleneck even after squaring reuse. + diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md new file mode 100644 index 00000000..61cd1fd4 --- /dev/null +++ b/docs/bluebox_compaction.md @@ -0,0 +1,49 @@ +# Bluebox Compaction Optimizations + +This document describes the compaction-oriented proving path exposed by +`src/c_bindings/fast_wrapper.h` and implemented in +`src/c_bindings/fast_wrapper.cpp`. 
+ +## Scope + +These APIs are intended for workloads where the expected VDF output (`y_ref`) is +already known up front (for example, bluebox compaction jobs). They are additive +and do not change the existing `c_wrapper` APIs. + +## Optimization 1: Streaming one-wesolowski + +Given `y_ref`, the prover computes: + +- `B = GetB(D, x, y_ref)` before squaring starts + +This enables a streaming algorithm that updates proof buckets at each +checkpoint during repeated squaring, instead of materializing the full +intermediate checkpoint array and scanning it after the loop. In practice this +substantially reduces memory usage for compaction workloads. + +## Optimization 2: Incremental GetBlock mapping + +For streaming checkpoint updates, bucket index selection repeatedly calls +`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and +advances sequential `p` values incrementally, avoiding full modular +exponentiation per call and avoiding a large lookup table. + +## Optimization 3: Memory-budgeted (k, l) tuning + +The wrapper can tune `(k, l)` under a configured memory budget: + +- `chiavdf_set_bucket_memory_budget_bytes(...)` + +If no tuned candidate is found, the code falls back to the standard parameter +heuristics. + +## Operational Notes + +- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to + avoid unsolicited stdout noise when embedded in multi-worker clients. +- Thread-slot assignment for the fast VDF counters is per-thread via + `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations + run in one process. +- The production default for `enable_threads` in `parameters.h` is unchanged from + upstream to preserve timelord expectations. 
+ diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client index 59fcbb63..0fe2380a 100644 --- a/src/Makefile.vdf-client +++ b/src/Makefile.vdf-client @@ -26,15 +26,26 @@ ifeq ($(UNAME),Darwin) NOPIE = endif -CFLAGS += $(LTO_FLAGS) $(NOPIE) -LDFLAGS += $(LTO_FLAGS) $(NOPIE) -g +# Optional: set `PIC=1` to build position-independent objects. +PIC ?= 0 +ifeq ($(PIC),1) +PICFLAGS = -fPIC +PIEFLAGS = +else +PICFLAGS = +PIEFLAGS = $(NOPIE) +endif + +CFLAGS += $(LTO_FLAGS) $(PIEFLAGS) $(PICFLAGS) +LDFLAGS += $(LTO_FLAGS) $(PIEFLAGS) -g ifeq ($(OS),Windows_NT) LDLIBS += -lmpirxx -lmpir -lws2_32 -CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(NOPIE) -fvisibility=hidden +CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden else LDLIBS += -lgmpxx -lgmp -pthread -CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden +CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden endif +ASFLAGS += $(PICFLAGS) ifeq ($(UNAME),Darwin) CXXFLAGS += -D CHIAOSX=1 # Homebrew (common on macOS) installs boost/gmp to /opt/homebrew or /usr/local @@ -81,7 +92,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench all: $(BINS) clean: - rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client + rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a $(BINS) avx512_test: %: %.o lzcnt.o $(ASM_OBJS) $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) @@ -91,6 +102,9 @@ $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS) lzcnt.o: refcode/lzcnt.c $(CC) $(CFLAGS) -c refcode/lzcnt.c +%.o: %.s + $(CC) -c $< -o $@ $(ASFLAGS) + asm_compiled.s: compile_asm ./compile_asm @@ -104,6 +118,21 @@ compile_asm: compile_asm.o $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o 
hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base_hw.o vdf_hw_symbol_anchors.o prover_runtime.o lzcnt.o +# --------------------------------------------------------------------------- +# Static library: fast one-wesolowski proof (BBR integration) +# --------------------------------------------------------------------------- + +FASTLIB = libchiavdf_fastc.a +FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o $(ASM_OBJS) + +.PHONY: fastlib + +fastlib: $(FASTLIB) + +$(FASTLIB): $(FASTLIB_OBJS) + $(AR) rcs $@ $^ + +c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS) EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o ifeq ($(OS),Windows_NT) HW_LIB = hw/libft4222/libft4222.lib diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp new file mode 100644 index 00000000..9184311c --- /dev/null +++ b/src/c_bindings/fast_wrapper.cpp @@ -0,0 +1,1507 @@ +#include "fast_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../vdf.h" +#include "../create_discriminant.h" + +// Runtime configuration knobs required by `parameters.h`. +// These are `extern` variables there, but each binary defines them explicitly. 
+bool use_divide_table = false; +int gcd_base_bits = 50; +int gcd_128_max_iter = 3; +std::string asmprefix = "cel_"; +bool enable_all_instructions = false; + +namespace { +std::once_flag init_once; +std::atomic bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL); +std::atomic streaming_stats_enabled(false); + +struct LastStreamingParameters { + uint32_t k = 0; + uint32_t l = 0; + bool tuned = false; + bool set = false; +}; + +thread_local LastStreamingParameters last_streaming_parameters; + +struct LastStreamingStats { + uint64_t checkpoint_total_ns = 0; + uint64_t checkpoint_event_total_ns = 0; + uint64_t finalize_total_ns = 0; + uint64_t checkpoint_calls = 0; + uint64_t bucket_updates = 0; + bool set = false; +}; + +thread_local LastStreamingStats last_streaming_stats; + +void init_chiavdf_fast() { + init_gmp(); + set_rounding_mode(); + + // Match the vdf_client runtime selection for AVX2. + if (hasAVX2()) { + gcd_base_bits = 63; + gcd_128_max_iter = 2; + } else { + gcd_base_bits = 50; + gcd_128_max_iter = 3; + } + + // Ensure we run the one-wesolowski path by default. + fast_algorithm = false; + two_weso = false; + quiet_mode = true; +} + +ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; } + +bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) { + if (shift >= 64) { + return false; + } + out = 1ULL << shift; + return true; +} + +uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) { + // Be conservative: class group forms contain 3 GMP-backed integers that + // quickly grow to the discriminant size (or beyond) during NUCOMP. + // + // This estimate is intentionally larger than the raw serialized size to + // avoid picking parameters that risk paging/OOM. 
+ uint64_t discr_bytes = (static_cast(discriminant_size_bits) + 7) / 8; + uint64_t estimate = discr_bytes * 16; + if (estimate < 2048) { + estimate = 2048; + } + return estimate; +} + +bool tune_streaming_parameters( + uint64_t num_iterations, + size_t discriminant_size_bits, + uint64_t memory_budget_bytes, + uint32_t& out_l, + uint32_t& out_k) { + if (memory_budget_bytes == 0) { + return false; + } + + // Keep headroom for GMP scratch allocations and general process overhead. + uint64_t budget = (memory_budget_bytes * 80) / 100; + uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits); + if (budget < bytes_per_form) { + return false; + } + + unsigned __int128 best_cost = std::numeric_limits::max(); + bool found = false; +#ifndef NDEBUG + uint32_t best_k = 0; + uint32_t best_l = 0; + unsigned __int128 best_updates = 0; + unsigned __int128 best_checkpoints = 0; + unsigned __int128 best_fold = 0; +#endif + + // Empirical tuning notes (1024-bit discriminants, AVX2 build): + // - Each bucket update (NUCOMP) and each fold unit is ~5µs. + // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs. + // + // So checkpoint counts should be weighted much lower than updates/fold. + constexpr unsigned __int128 update_weight = 16; + constexpr unsigned __int128 fold_weight = 16; + constexpr unsigned __int128 checkpoint_weight = 1; + + // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work + // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k). 
+ for (uint32_t k = 4; k <= 20; k++) { + unsigned __int128 buckets_per_row = static_cast(1) << k; + + for (uint32_t l = 1; l <= 64; l++) { + unsigned __int128 form_count = buckets_per_row * static_cast(l); + unsigned __int128 mem_required = + form_count * static_cast(bytes_per_form); + if (mem_required > static_cast(budget)) { + continue; + } + + uint64_t kl = static_cast(k) * static_cast(l); + unsigned __int128 checkpoints = static_cast( + (num_iterations + kl - 1) / kl); + // Each checkpoint can trigger up to `l` bucket updates (one per sub-block). + // Model update work as checkpoint-count scaled by `l`. + unsigned __int128 updates = checkpoints * static_cast(l); + unsigned __int128 fold = static_cast(l) << (k + 1); + unsigned __int128 cost = + updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight; + + if (!found || cost < best_cost) { + found = true; + best_cost = cost; + out_k = k; + out_l = l; +#ifndef NDEBUG + best_k = k; + best_l = l; + best_updates = updates; + best_checkpoints = checkpoints; + best_fold = fold; +#endif + } + } + } + +#ifndef NDEBUG + if (found) { + assert(best_k >= 4 && best_k <= 20); + assert(best_l >= 1 && best_l <= 64); + std::fprintf( + stderr, + "[chiavdf] tune_streaming_parameters: T=%llu, budget=%llu, selected=(k=%u,l=%u), " + "components{updates=%llu, checkpoints=%llu, fold=%llu}, weights{u=16,c=1,f=16}\n", + static_cast(num_iterations), + static_cast(memory_budget_bytes), + best_k, + best_l, + static_cast(best_updates), + static_cast(best_checkpoints), + static_cast(best_fold)); + if (best_k == 20 && num_iterations < (1ULL << 24)) { + std::fprintf( + stderr, + "[chiavdf] tune_streaming_parameters: high-k selection for moderate T " + "(k=20, T=%llu); verify measured update/fold timing assumptions.\n", + static_cast(num_iterations)); + } + } +#endif + + return found; +} + +uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) { + integer res = FastPow(2, T - k * (i + 1), B); + 
mpz_mul_2exp(res.impl, res.impl, k); + res = res / B; + auto res_vector = res.to_vector(); + return res_vector.empty() ? 0 : res_vector[0]; +} + +class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback { + public: + ProgressOneWesolowskiCallback( + integer& D, + form& f, + uint64_t wanted_iter, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) + : OneWesolowskiCallback(D, f, wanted_iter), + progress_interval(progress_interval), + progress_cb(progress_cb), + progress_user_data(progress_user_data), + next_progress(progress_interval) {} + + void OnIteration(int type, void* data, uint64_t iteration) override { + OneWesolowskiCallback::OnIteration(type, data, iteration); + + if (progress_cb == nullptr || progress_interval == 0) { + return; + } + + uint64_t done = iteration + 1; + if (done > wanted_iter) { + return; + } + + if (done >= next_progress) { + progress_cb(next_progress, progress_user_data); + next_progress += progress_interval; + } + } + + private: + uint64_t progress_interval; + ChiavdfProgressCallback progress_cb; + void* progress_user_data; + uint64_t next_progress; +}; + +class StreamingWesolowskiBuckets { + public: + StreamingWesolowskiBuckets( + integer& D, + integer& L, + uint64_t wanted_iter, + uint32_t k, + uint32_t l, + uint64_t limit, + integer B, + bool use_getblock_opt) + : D(D), + L(L), + wanted_iter(wanted_iter), + k(k), + l(l), + kl(static_cast(k) * static_cast(l)), + limit(limit), + B(std::move(B)), + use_getblock_opt(use_getblock_opt), + stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) { + form id = form::identity(D); + uint64_t bucket_span_u64 = 0; + if (!try_pow2_u64_shift(k, bucket_span_u64)) { + getblock_ok = false; + return; + } + + bucket_span = static_cast(bucket_span_u64); + if (bucket_span != 0 && static_cast(l) > std::numeric_limits::max() / bucket_span) { + getblock_ok = false; + return; + } + + buckets.resize(static_cast(l) * bucket_span, id); + 
+ if (use_getblock_opt) { + getblock_ok = init_getblock_opt_state(); + } + } + + uint64_t wanted_iterations() const { return wanted_iter; } + + uint64_t checkpoint_stride() const { return kl; } + + uint64_t checkpoint_limit() const { return limit; } + + void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats = true) { + const bool do_stats = stats_enabled && record_stats; + auto started_at = std::chrono::steady_clock::time_point{}; + if (do_stats) { + started_at = std::chrono::steady_clock::now(); + } + + uint64_t local_updates = 0; + for (uint32_t j = 0; j < l; j++) { + uint64_t p = i * static_cast(l) + static_cast(j); + uint64_t needed = static_cast(k) * (p + 1); + if (wanted_iter < needed) { + break; + } + uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B); + if (do_stats) { + local_updates++; + } + nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L); + } + + if (do_stats) { + checkpoint_calls++; + bucket_updates += local_updates; + checkpoint_total_ns += static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - started_at) + .count()); + } + } + + bool init_ok() const { return getblock_ok; } + + form finalize_proof() const { + PulmarkReducer reducer; + return finalize_proof_with_reducer(reducer); + } + + form finalize_proof_with_reducer(PulmarkReducer& reducer) const { + auto started_at = std::chrono::steady_clock::time_point{}; + if (stats_enabled) { + started_at = std::chrono::steady_clock::now(); + } + + form id = form::identity(D); + + uint64_t k1 = k / 2; + uint64_t k0 = k - k1; + uint64_t span_k0 = 0; + uint64_t span_k1 = 0; + if (!try_pow2_u64_shift(static_cast(k0), span_k0) || + !try_pow2_u64_shift(static_cast(k1), span_k1)) { + return form::identity(D); + } + form x = id; + + for (int64_t j = static_cast(l) - 1; j >= 0; j--) { + x = FastPowFormNucomp(x, D, integer(static_cast(bucket_span)), L, reducer); + + for (uint64_t b1 = 0; b1 < span_k1; b1++) { + form z = id; + for 
(uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { + nucomp_form( + z, + z, + bucket(static_cast(j), b1 * (1ULL << k0) + b0), + D, + L); + } + z = FastPowFormNucomp( + z, + D, + integer(static_cast(b1 * span_k0)), + L, + reducer); + nucomp_form(x, x, z, D, L); + } + + for (uint64_t b0 = 0; b0 < span_k0; b0++) { + form z = id; + for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { + nucomp_form( + z, + z, + bucket(static_cast(j), b1 * (1ULL << k0) + b0), + D, + L); + } + z = FastPowFormNucomp(z, D, integer(b0), L, reducer); + nucomp_form(x, x, z, D, L); + } + } + + reducer.reduce(x); + + if (stats_enabled) { + finalize_total_ns += static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - started_at) + .count()); + } + return x; + } + + bool stats_ok() const { return stats_enabled; } + + void record_checkpoint_event_ns(uint64_t ns) { + if (stats_enabled) { + checkpoint_event_total_ns += ns; + } + } + + LastStreamingStats stats() const { + LastStreamingStats out; + out.checkpoint_total_ns = checkpoint_total_ns; + out.checkpoint_event_total_ns = checkpoint_event_total_ns; + out.finalize_total_ns = finalize_total_ns; + out.checkpoint_calls = checkpoint_calls; + out.bucket_updates = bucket_updates; + out.set = stats_enabled; + return out; + } + + private: + form& bucket(uint32_t j, uint64_t b) { + size_t idx = static_cast(j) * bucket_span + static_cast(b); + return buckets[idx]; + } + + const form& bucket(uint32_t j, uint64_t b) const { + size_t idx = static_cast(j) * bucket_span + static_cast(b); + return buckets[idx]; + } + + integer& D; + integer& L; + uint64_t wanted_iter; + uint32_t k; + uint32_t l; + uint64_t kl; + uint64_t limit; + integer B; + std::vector
buckets; + + bool use_getblock_opt; + bool getblock_ok = true; + uint64_t getblock_next_p = 0; + integer getblock_inv_2k; + integer getblock_r; + integer getblock_tmp; + + bool stats_enabled; + uint64_t checkpoint_total_ns = 0; + uint64_t checkpoint_event_total_ns = 0; + mutable uint64_t finalize_total_ns = 0; + uint64_t checkpoint_calls = 0; + uint64_t bucket_updates = 0; + + bool init_getblock_opt_state() { + if (k == 0) { + return false; + } + uint64_t k_u64 = static_cast(k); + if (wanted_iter < k_u64) { + getblock_next_p = 0; + return true; + } + + integer two_k_mod = FastPow(2, k_u64, B); + if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) { + return false; + } + + getblock_r = FastPow(2, wanted_iter - k_u64, B); + getblock_next_p = 0; + return true; + } + + uint64_t get_block_opt(uint64_t p) { + if (!getblock_ok || wanted_iter < static_cast(k)) { + return get_block(p, k, wanted_iter, B); + } + + // Expected call pattern is sequential `p`. If we ever get out of sync, + // advance state forward or fall back to the slow mapping. 
+ if (p < getblock_next_p) { + return get_block(p, k, wanted_iter, B); + } + while (getblock_next_p < p) { + mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); + mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); + getblock_next_p++; + } + + mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k); + mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl); + uint64_t b = mpz_get_ui(getblock_tmp.impl); + + mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); + mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); + getblock_next_p++; + + return b; + } +}; + +class StreamingOneWesolowskiCallback final : public WesolowskiCallback { + public: + StreamingOneWesolowskiCallback( + integer& discriminant, + uint64_t wanted_iter, + uint32_t k, + uint32_t l, + uint64_t limit, + integer B, + bool use_getblock_opt, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) + : WesolowskiCallback(discriminant), + buckets( + this->D, + this->L, + wanted_iter, + k, + l, + limit, + std::move(B), + use_getblock_opt), + progress_interval(progress_interval), + progress_cb(progress_cb), + progress_user_data(progress_user_data), + next_progress(progress_interval) {} + + bool init_ok() const { return buckets.init_ok(); } + + void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override { + (void)base_iteration; + (void)batch_size; + replayed_after_corruption = true; + } + + void OnIteration(int type, void* data, uint64_t iteration) override { + if (replayed_after_corruption) { + return; + } + iteration++; + if (iteration > buckets.wanted_iterations()) { + return; + } + + if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) { + progress_cb(next_progress, progress_user_data); + next_progress += progress_interval; + } + + uint64_t stride = buckets.checkpoint_stride(); + if (stride != 0 && iteration % stride == 0) { + uint64_t pos = iteration / stride; + if (pos < buckets.checkpoint_limit()) { 
+ form checkpoint; + auto started_at = std::chrono::steady_clock::time_point{}; + const bool do_stats = buckets.stats_ok(); + if (do_stats) { + started_at = std::chrono::steady_clock::now(); + } + SetForm(type, data, &checkpoint); + buckets.process_checkpoint(pos, checkpoint); + if (do_stats) { + buckets.record_checkpoint_event_ns(static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - started_at) + .count())); + } + } + } + + if (iteration == buckets.wanted_iterations()) { + SetForm(type, data, &result); + has_result = true; + } + } + + void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats = true) { + buckets.process_checkpoint(i, checkpoint, record_stats); + } + + bool ok() const { return has_result && !replayed_after_corruption; } + + const form& y() const { return result; } + + form finalize_proof() const { return buckets.finalize_proof(); } + + bool stats_ok() const { return buckets.stats_ok(); } + + LastStreamingStats stats() const { return buckets.stats(); } + + private: + StreamingWesolowskiBuckets buckets; + uint64_t progress_interval; + ChiavdfProgressCallback progress_cb; + void* progress_user_data; + uint64_t next_progress; + + form result; + bool has_result = false; + bool replayed_after_corruption = false; +}; + +struct BatchJobState { + size_t index; + uint64_t wanted_iter; + uint32_t k; + uint32_t l; + uint64_t kl; + uint64_t limit; + form y_ref; + StreamingWesolowskiBuckets buckets; + uint64_t next_checkpoint_t; + uint64_t next_event_t; + bool done = false; + + BatchJobState( + size_t index, + uint64_t wanted_iter, + uint32_t k, + uint32_t l, + uint64_t limit, + form y_ref, + StreamingWesolowskiBuckets buckets) + : index(index), + wanted_iter(wanted_iter), + k(k), + l(l), + kl(static_cast(k) * static_cast(l)), + limit(limit), + y_ref(std::move(y_ref)), + buckets(std::move(buckets)), + next_checkpoint_t( + (limit <= 1) ? 
std::numeric_limits::max() + : (static_cast(k) * static_cast(l))), + next_event_t(std::min(wanted_iter, next_checkpoint_t)) {} +}; + +struct BatchFinalizeLatch { + void add_task() { + std::lock_guard lk(mutex); + remaining++; + } + + void task_done() { + std::lock_guard lk(mutex); + if (remaining > 0) { + remaining--; + } + if (remaining == 0) { + cv.notify_all(); + } + } + + void wait() { + std::unique_lock lk(mutex); + cv.wait(lk, [this]() { return remaining == 0; }); + } + + private: + std::mutex mutex; + std::condition_variable cv; + size_t remaining = 0; +}; + +struct BatchFinalizeTask { + size_t idx; + int d_bits; + ChiavdfByteArray* out_arrays; + form y_ref; + StreamingWesolowskiBuckets buckets; + std::shared_ptr latch; + + BatchFinalizeTask( + size_t idx, + int d_bits, + ChiavdfByteArray* out_arrays, + form y_ref, + StreamingWesolowskiBuckets buckets, + std::shared_ptr latch) + : idx(idx), + d_bits(d_bits), + out_arrays(out_arrays), + y_ref(std::move(y_ref)), + buckets(std::move(buckets)), + latch(std::move(latch)) {} +}; + +class GlobalBatchFinalizerPool final { + public: + static GlobalBatchFinalizerPool& instance() { + static GlobalBatchFinalizerPool pool; + return pool; + } + + void enqueue(BatchFinalizeTask task) { + { + std::lock_guard lk(mutex); + queue.push(std::move(task)); + } + cv.notify_one(); + } + + private: + GlobalBatchFinalizerPool() { start_workers(); } + + ~GlobalBatchFinalizerPool() { + { + std::lock_guard lk(mutex); + shutdown = true; + } + cv.notify_all(); + for (auto& t : workers) { + if (t.joinable()) { + t.join(); + } + } + } + + GlobalBatchFinalizerPool(const GlobalBatchFinalizerPool&) = delete; + GlobalBatchFinalizerPool& operator=(const GlobalBatchFinalizerPool&) = delete; + + void start_workers() { + size_t count = std::thread::hardware_concurrency(); + if (count == 0) { + count = 1; + } + // Keep this intentionally small: the caller already runs many compute + // threads (Rust `-p` workers), and each extra C++ worker carries 
large + // thread-local GMP scratch state (NUCOMP/NUDUPL). + count = std::max(1, count / 4); + count = std::min(count, 8); + + workers.reserve(count); + for (size_t i = 0; i < count; i++) { + workers.emplace_back([this]() { worker_loop(); }); + } + } + + std::optional take_task() { + std::unique_lock lk(mutex); + cv.wait(lk, [this]() { return shutdown || !queue.empty(); }); + if (queue.empty()) { + return std::nullopt; + } + BatchFinalizeTask task = std::move(queue.front()); + queue.pop(); + return std::make_optional(std::move(task)); + } + + void worker_loop() { + PulmarkReducer reducer; + while (true) { + auto task_opt = take_task(); + if (!task_opt.has_value()) { + return; + } + BatchFinalizeTask task = std::move(*task_opt); + + struct LatchGuard { + std::shared_ptr latch; + ~LatchGuard() { + if (latch) { + latch->task_done(); + } + } + } guard{task.latch}; + + try { + form proof_form = task.buckets.finalize_proof_with_reducer(reducer); + std::vector y_serialized = SerializeForm(task.y_ref, task.d_bits); + std::vector proof_serialized = SerializeForm(proof_form, task.d_bits); + if (y_serialized.empty() || proof_serialized.empty()) { + task.out_arrays[task.idx] = empty_result(); + continue; + } + + const size_t total = y_serialized.size() + proof_serialized.size(); + uint8_t* out = new uint8_t[total]; + std::copy(y_serialized.begin(), y_serialized.end(), out); + std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size()); + task.out_arrays[task.idx] = ChiavdfByteArray{out, total}; + } catch (...) 
{ + task.out_arrays[task.idx] = empty_result(); + } + } + } + + std::vector workers; + std::mutex mutex; + std::condition_variable cv; + std::queue queue; + bool shutdown = false; +}; + +class BatchOneWesolowskiCallback final : public WesolowskiCallback { + public: + BatchOneWesolowskiCallback( + integer& D, + const integer& shared_D, + const integer& shared_L, + int d_bits, + ChiavdfByteArray* out_arrays, + size_t job_count, + std::atomic& stopped, + std::vector jobs, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) + : WesolowskiCallback(D), + shared_D(shared_D), + shared_L(shared_L), + d_bits(d_bits), + out_arrays(out_arrays), + job_count(job_count), + stopped(stopped), + jobs(std::move(jobs)), + progress_interval(progress_interval), + progress_cb(progress_cb), + progress_user_data(progress_user_data), + next_progress(progress_interval), + finalizer_latch(std::make_shared()) { + } + + void initialize(const form& x0) { + for (size_t job_pos = 0; job_pos < jobs.size(); job_pos++) { + auto& job = jobs[job_pos]; + job.buckets.process_checkpoint(/*i=*/0, x0, /*record_stats=*/false); + schedule_job(job_pos); + } + refresh_next_event(); + } + + void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override { + (void)base_iteration; + (void)batch_size; + // Streaming bucket updates are irreversible, so fail closed if replay is needed. 
+ fatal_error = true; + stopped.store(true); + } + + void OnIteration(int type, void* data, uint64_t iteration) override { + if (fatal_error) { + return; + } + iteration++; + if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) { + progress_cb(next_progress, progress_user_data); + next_progress += progress_interval; + } + if (iteration != next_event) { + return; + } + + form checkpoint; + SetForm(type, data, &checkpoint); + + while (!event_queue.empty()) { + const JobEvent next = event_queue.top(); + if (next.t != iteration) { + break; + } + event_queue.pop(); + + BatchJobState& job = jobs[next.job_pos]; + if (job.done || job.next_event_t != iteration) { + continue; + } + + if (job.next_checkpoint_t == iteration) { + uint64_t i = iteration / job.kl; + if (i < job.limit) { + job.buckets.process_checkpoint(i, checkpoint); + } + + const uint64_t next_i = i + 1; + if (next_i < job.limit) { + job.next_checkpoint_t = next_i * job.kl; + } else { + job.next_checkpoint_t = std::numeric_limits::max(); + } + } + + if (job.wanted_iter == iteration) { + if (!(checkpoint == job.y_ref)) { + fatal_error = true; + stopped.store(true); + return; + } + spawn_finalize_job(job); + } + + if (!job.done) { + schedule_job(next.job_pos); + } + } + + refresh_next_event(); + } + + bool ok() const { return !fatal_error; } + + void join_finalizers() { + finalizer_latch->wait(); + } + + private: + struct JobEvent { + uint64_t t; + size_t job_pos; + }; + + struct JobEventGreater { + bool operator()(const JobEvent& a, const JobEvent& b) const noexcept { return a.t > b.t; } + }; + + void schedule_job(size_t job_pos) { + BatchJobState& job = jobs[job_pos]; + if (job.done) { + job.next_event_t = std::numeric_limits::max(); + return; + } + job.next_event_t = std::min(job.wanted_iter, job.next_checkpoint_t); + event_queue.push(JobEvent{job.next_event_t, job_pos}); + } + + void refresh_next_event() { + while (!event_queue.empty()) { + const JobEvent next = 
event_queue.top(); + const BatchJobState& job = jobs[next.job_pos]; + if (job.done || job.next_event_t != next.t) { + event_queue.pop(); + continue; + } + next_event = next.t; + return; + } + next_event = std::numeric_limits::max(); + stopped.store(true); + } + + void spawn_finalize_job(BatchJobState& job) { + job.done = true; + job.next_checkpoint_t = std::numeric_limits::max(); + job.next_event_t = std::numeric_limits::max(); + + finalizer_latch->add_task(); + GlobalBatchFinalizerPool::instance().enqueue(BatchFinalizeTask( + job.index, + d_bits, + out_arrays, + std::move(job.y_ref), + std::move(job.buckets), + finalizer_latch)); + } + + const integer& shared_D; + const integer& shared_L; + int d_bits; + ChiavdfByteArray* out_arrays; + size_t job_count; + std::atomic& stopped; + std::vector jobs; + std::shared_ptr finalizer_latch; + std::priority_queue, JobEventGreater> event_queue; + uint64_t next_event = std::numeric_limits::max(); + uint64_t progress_interval; + ChiavdfProgressCallback progress_cb; + void* progress_user_data; + uint64_t next_progress; + bool fatal_error = false; +}; + +ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data, + bool use_getblock_opt) { + std::call_once(init_once, init_chiavdf_fast); + + last_streaming_stats = LastStreamingStats{}; + + if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 || + y_ref_s == nullptr || y_ref_s_size == 0) { + return empty_result(); + } + if (num_iterations == 0) { + return empty_result(); + } + + std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); + integer D = CreateDiscriminant(challenge_hash_bytes, 
static_cast(discriminant_size_bits)); + integer L = root(-D, 4); + + form x = DeserializeForm(D, x_s, x_s_size); + form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size); + + uint32_t k; + uint32_t l; + bool tuned = false; + const uint64_t budget = + bucket_memory_budget_bytes.load(std::memory_order_relaxed); + if (num_iterations >= (1 << 16)) { + tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k); + } + if (!tuned) { + if (num_iterations >= (1 << 16)) { + ApproximateParameters(num_iterations, l, k); + } else { + k = 10; + l = 1; + } + } + if (k == 0) { + k = 1; + } + if (l == 0) { + l = 1; + } + uint64_t ignored_bucket_span = 0; + if (!try_pow2_u64_shift(k, ignored_bucket_span)) { + return empty_result(); + } + + last_streaming_parameters.k = k; + last_streaming_parameters.l = l; + last_streaming_parameters.tuned = tuned; + last_streaming_parameters.set = true; + + uint64_t kl = static_cast(k) * static_cast(l); + uint64_t limit = num_iterations / kl; + if (num_iterations % kl) { + limit++; + } + + integer B = GetB(D, x, y_ref); + + std::atomic stopped(false); + StreamingOneWesolowskiCallback weso( + D, + num_iterations, + k, + l, + limit, + std::move(B), + use_getblock_opt, + progress_interval, + progress_cb, + progress_user_data); + + if (!weso.init_ok()) { + return empty_result(); + } + + weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false); + + FastStorage* fast_storage = nullptr; + repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); + + if (!weso.ok()) { + return empty_result(); + } + if (!(weso.y() == y_ref)) { + return empty_result(); + } + + form proof_form = weso.finalize_proof(); + + if (weso.stats_ok()) { + last_streaming_stats = weso.stats(); + } + + int d_bits = D.num_bits(); + std::vector y_serialized = SerializeForm(y_ref, d_bits); + std::vector proof_serialized = SerializeForm(proof_form, d_bits); + + if (y_serialized.empty() || proof_serialized.empty()) { + return empty_result(); + } + 
+ const size_t total = y_serialized.size() + proof_serialized.size(); + uint8_t* out = new uint8_t[total]; + std::copy(y_serialized.begin(), y_serialized.end(), out); + std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size()); + return ChiavdfByteArray{out, total}; +} +} // namespace + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations) { + return chiavdf_prove_one_weso_fast_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + discriminant_size_bits, + num_iterations, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + try { + std::call_once(init_once, init_chiavdf_fast); + + if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) { + return empty_result(); + } + if (num_iterations == 0) { + return empty_result(); + } + + std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); + integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); + integer L = root(-D, 4); + + form x = DeserializeForm(D, x_s, x_s_size); + + std::atomic stopped(false); + ProgressOneWesolowskiCallback weso( + D, + x, + num_iterations, + progress_interval, + progress_cb, + progress_user_data); + + // Run the fast repeated-squaring engine to `num_iterations`. + // The callback stores all intermediates needed for the proof. 
+ FastStorage* fast_storage = nullptr; + repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); + + // Now generate the compact proof from the stored intermediates. + Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped); + if (proof.y.empty() || proof.proof.empty()) { + return empty_result(); + } + + const size_t total = proof.y.size() + proof.proof.size(); + uint8_t* out = new uint8_t[total]; + std::copy(proof.y.begin(), proof.y.end(), out); + std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size()); + return ChiavdfByteArray{out, total}; + } catch (...) { + return empty_result(); + } +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations) { + return chiavdf_prove_one_weso_fast_streaming_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + try { + return chiavdf_prove_one_weso_fast_streaming_impl( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + progress_interval, + progress_cb, + progress_user_data, + /*use_getblock_opt=*/false); + } catch (...) 
{ + return empty_result(); + } +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations) { + return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + try { + return chiavdf_prove_one_weso_fast_streaming_impl( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + progress_interval, + progress_cb, + progress_user_data, + /*use_getblock_opt=*/true); + } catch (...) 
{ + return empty_result(); + } +} + +extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) { + bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed); +} + +extern "C" void chiavdf_set_enable_streaming_stats(bool enable) { + streaming_stats_enabled.store(enable, std::memory_order_relaxed); + last_streaming_stats = LastStreamingStats{}; +} + +extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) { + if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) { + return false; + } + if (!last_streaming_parameters.set) { + return false; + } + *out_k = last_streaming_parameters.k; + *out_l = last_streaming_parameters.l; + *out_tuned = last_streaming_parameters.tuned; + return true; +} + +extern "C" bool chiavdf_get_last_streaming_stats( + uint64_t* out_checkpoint_total_ns, + uint64_t* out_checkpoint_event_total_ns, + uint64_t* out_finalize_total_ns, + uint64_t* out_checkpoint_calls, + uint64_t* out_bucket_updates) { + if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr || + out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr || + out_bucket_updates == nullptr) { + return false; + } + if (!last_streaming_stats.set) { + return false; + } + *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns; + *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns; + *out_finalize_total_ns = last_streaming_stats.finalize_total_ns; + *out_checkpoint_calls = last_streaming_stats.checkpoint_calls; + *out_bucket_updates = last_streaming_stats.bucket_updates; + return true; +} + +extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + const ChiavdfBatchJob* jobs, + size_t job_count) { + return 
chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + discriminant_size_bits, + jobs, + job_count, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + const ChiavdfBatchJob* jobs, + size_t job_count, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + ChiavdfByteArray* out_arrays = nullptr; + std::unique_ptr weso; + bool finalizers_joined = false; + integer D; + integer L; + std::atomic stopped(false); + try { + std::call_once(init_once, init_chiavdf_fast); + + if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 || + jobs == nullptr || job_count == 0 || discriminant_size_bits == 0) { + return nullptr; + } + + for (size_t idx = 0; idx < job_count; idx++) { + if (jobs[idx].y_ref_s == nullptr || jobs[idx].y_ref_s_size == 0 || + jobs[idx].num_iterations == 0) { + return nullptr; + } + } + + std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); + D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); + L = root(-D, 4); + + form x0 = DeserializeForm(D, x_s, x_s_size); + + int d_bits = D.num_bits(); + + out_arrays = new ChiavdfByteArray[job_count](); + + uint64_t t_max = 0; + std::vector job_states; + job_states.reserve(job_count); + + const uint64_t budget = bucket_memory_budget_bytes.load(std::memory_order_relaxed); + const uint64_t per_job_budget = (budget == 0 || job_count == 0) + ? 
budget + : (budget / static_cast(job_count)); + + for (size_t idx = 0; idx < job_count; idx++) { + uint64_t num_iterations = jobs[idx].num_iterations; + t_max = std::max(t_max, num_iterations); + + form y_ref = DeserializeForm(D, jobs[idx].y_ref_s, jobs[idx].y_ref_s_size); + + uint32_t k; + uint32_t l; + bool tuned = false; + if (num_iterations >= (1 << 16)) { + tuned = tune_streaming_parameters( + num_iterations, + discriminant_size_bits, + per_job_budget, + l, + k); + } + if (!tuned) { + if (num_iterations >= (1 << 16)) { + ApproximateParameters(num_iterations, l, k); + } else { + k = 10; + l = 1; + } + } + if (k == 0) { + k = 1; + } + if (l == 0) { + l = 1; + } + + uint64_t kl = static_cast(k) * static_cast(l); + uint64_t limit = num_iterations / kl; + if (num_iterations % kl) { + limit++; + } + + integer B = GetB(D, x0, y_ref); + + StreamingWesolowskiBuckets buckets( + D, + L, + num_iterations, + k, + l, + limit, + std::move(B), + /*use_getblock_opt=*/true); + + if (!buckets.init_ok()) { + chiavdf_free_byte_array_batch(out_arrays, job_count); + return nullptr; + } + + job_states.emplace_back( + idx, + num_iterations, + k, + l, + limit, + std::move(y_ref), + std::move(buckets)); + } + + weso = std::make_unique( + D, + D, + L, + d_bits, + out_arrays, + job_count, + stopped, + std::move(job_states), + progress_interval, + progress_cb, + progress_user_data); + weso->initialize(x0); + + FastStorage* fast_storage = nullptr; + repeated_square(t_max, x0, D, L, weso.get(), fast_storage, stopped); + + weso->join_finalizers(); + finalizers_joined = true; + + if (!weso->ok()) { + chiavdf_free_byte_array_batch(out_arrays, job_count); + return nullptr; + } + + return out_arrays; + } catch (...) { + if (weso != nullptr && !finalizers_joined) { + try { + weso->join_finalizers(); + } catch (...) 
{ + } + } + chiavdf_free_byte_array_batch(out_arrays, job_count); + return nullptr; + } +} + +extern "C" void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count) { + if (arrays == nullptr) { + return; + } + for (size_t idx = 0; idx < count; idx++) { + delete[] arrays[idx].data; + arrays[idx] = empty_result(); + } + delete[] arrays; +} + +extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; } diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h new file mode 100644 index 00000000..a83bd746 --- /dev/null +++ b/src/c_bindings/fast_wrapper.h @@ -0,0 +1,197 @@ +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + uint8_t* data; + size_t length; +} ChiavdfByteArray; + +typedef struct { + const uint8_t* y_ref_s; + size_t y_ref_s_size; + uint64_t num_iterations; +} ChiavdfBatchJob; + +typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data); + +// Configure the per-process memory budget used by the parameter tuner when +// selecting `(k,l)` for streaming/bucket-based proving. +// +// The budget is per worker process (not global across multiple processes). +// +// If `bytes` is 0, the default chiavdf heuristic is used. +void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes); + +// Debug helper: returns the `(k,l)` parameters selected for the most recent +// streaming proof computed on the current thread. +// +// Returns true if parameters are available. +bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned); + +// Enable lightweight timing counters for the streaming prover. +// +// When enabled, the native library records basic timing counters for the most +// recent streaming proof computed on the current thread. This is intended for +// benchmarking and tuning; production runs should keep this disabled to avoid +// extra overhead. 
+void chiavdf_set_enable_streaming_stats(bool enable);
+
+// Debug helper: returns timing counters for the most recent streaming proof on
+// the current thread.
+//
+// Returns true if stats are available (i.e. stats enabled and a streaming proof
+// was computed successfully). Returns false without writing anything if any of
+// the output pointers is NULL.
+bool chiavdf_get_last_streaming_stats(
+    uint64_t* out_checkpoint_total_ns,
+    uint64_t* out_checkpoint_event_total_ns,
+    uint64_t* out_finalize_total_ns,
+    uint64_t* out_checkpoint_calls,
+    uint64_t* out_bucket_updates);
+
+// NOTE: `ChiavdfBatchJob` is declared once, next to `ChiavdfByteArray` at the
+// top of this header. It must not be repeated here: a second anonymous-struct
+// typedef with the same name is a conflicting redefinition in both C and C++.
+
+// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
+//
+// On success, returns `y || proof` where:
+// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
+// - `proof` is the serialized witness form (same size as `y`)
+//
+// The returned buffer is owned by the caller and must be released with
+// `chiavdf_free_byte_array(...)`.
+//
+// On failure, returns `{NULL, 0}`. Failures include a NULL `challenge_hash` or
+// `x_s`, a zero `challenge_size` or `x_s_size`, and `num_iterations == 0`.
+ChiavdfByteArray chiavdf_prove_one_weso_fast(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
+// the proving thread every `progress_interval` iterations completed.
+//
+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
+// bucket-accumulation algorithm (Trick 1), which requires the expected output
+// `y_ref` up front (as used by bluebox compaction jobs).
+//
+// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`).
+//
+// Returns an empty result on failure, for example when an input pointer is
+// NULL, an input size or `num_iterations` is 0, or the output computed by the
+// squaring run does not equal `y_ref`. A non-empty result is owned by the
+// caller and must be released with `chiavdf_free_byte_array(...)`.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
+// `progress_cb` from the proving thread every `progress_interval` iterations.
+//
+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is
+// reported. `progress_user_data` is passed back to the callback unchanged.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
+// implementation of the `GetBlock()` mapping (avoids per-block modular
+// exponentiation without allocating a full `GetBlock` table).
+//
+// Arguments, result format and failure behavior are the same as for
+// `chiavdf_prove_one_weso_fast_streaming`; both entry points share one
+// implementation and differ only in the GetBlock strategy.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
+// invokes `progress_cb` from the proving thread every `progress_interval`
+// iterations.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Computes multiple compact (witness_type=0) Wesolowski proofs in one shared
+// squaring run ("Trick 2"), using the streaming algorithm (Trick 1) and the
+// GetBlock precomputation optimization.
+//
+// All jobs in the batch must share the same:
+// - `challenge_hash`
+// - `x_s` (input form bytes)
+// - `discriminant_size_bits`
+//
+// Each `jobs[i]` supplies its own expected output (`y_ref_s`, `y_ref_s_size`)
+// and `num_iterations`; entry `i` of the returned array belongs to `jobs[i]`.
+//
+// Returns an array of `job_count` byte arrays, each containing `y || proof` on
+// success. The caller must free the returned array using
+// `chiavdf_free_byte_array_batch(...)`.
+//
+// On fatal error (including output mismatch), returns NULL. NULL is also
+// returned for invalid arguments: a NULL pointer, a zero size, `job_count == 0`,
+// or any job with a NULL/empty `y_ref_s` or `num_iterations == 0`.
+//
+// NOTE: a non-NULL return can still contain empty entries. If finalizing or
+// serializing one job's proof fails, only that job's entry is left empty, so
+// callers should check every entry before using it.
+ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch`, but
+// optionally invokes `progress_cb` from the proving thread every
+// `progress_interval` squaring iterations completed.
+ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count);
+
+void chiavdf_free_byte_array(ChiavdfByteArray array);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/callback.h b/src/callback.h
index f4764bbf..9ebf3543 100644
--- a/src/callback.h
+++ b/src/callback.h
@@ -73,6 +73,14 @@ class WesolowskiCallback :public INUDUPLListener {
     }
 
     virtual void OnIteration(int type, void *data, uint64_t iteration) = 0;
+    virtual void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) {
+        (void)base_iteration;
+        (void)batch_size;
+    }
+    virtual void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) {
+        (void)base_iteration;
+        (void)batch_size;
+    }
 
     std::unique_ptr<form[]> forms;
     size_t forms_capacity = 0;
diff --git a/src/threading.h b/src/threading.h
index 3244b3c3..3574a98f 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -566,8 +566,14 @@ struct alignas(64) thread_counter {
     }
 };
 
-thread_counter master_counter[100];
-thread_counter slave_counter[100];
+#ifndef CHIA_VDF_FAST_COUNTER_SLOTS
+#define CHIA_VDF_FAST_COUNTER_SLOTS 100
+#endif
+
+static_assert(CHIA_VDF_FAST_COUNTER_SLOTS > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
+
+thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
+thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 
 struct thread_state {
     int pairindex;
diff --git a/src/vdf.h b/src/vdf.h
index 7bb911f9..92a56b78 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -87,6 +87,63 @@ std::mutex new_event_mutex, cout_lock;
 bool debug_mode = false;
 bool fast_algorithm = false;
 bool two_weso = false;
+bool quiet_mode = false;
+
+// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
+// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
+// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
+// run concurrently in the same process; they must not share a pairindex.
+#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
+inline std::atomic<unsigned int> vdf_fast_next_slot{0};
+#endif
+
+inline int vdf_fast_pairindex() {
+#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
+    constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
+    static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
+    static std::array<std::atomic<bool>, kSlots> vdf_fast_slot_in_use{};
+    struct SlotLease {
+        std::array<std::atomic<bool>, kSlots>* slots = nullptr;
+        int slot = -1;
+        bool owns_slot = false;
+        ~SlotLease() {
+            if (owns_slot && slots != nullptr && slot >= 0) {
+                (*slots)[static_cast<size_t>(slot)].store(false, std::memory_order_release);
+            }
+        }
+    };
+
+    thread_local SlotLease lease;
+    if (lease.slot >= 0) {
+        return lease.slot;
+    }
+
+    lease.slots = &vdf_fast_slot_in_use;
+
+    const unsigned int start = vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed);
+    for (unsigned int i = 0; i < kSlots; i++) {
+        const unsigned int candidate = (start + i) % kSlots;
+        bool expected = false;
+        if (vdf_fast_slot_in_use[candidate].compare_exchange_strong(
+                expected,
+                true,
+                std::memory_order_acq_rel,
+                std::memory_order_relaxed)) {
+            lease.slot = static_cast<int>(candidate);
+            lease.owns_slot = true;
+            return lease.slot;
+        }
+    }
+
+    // All slots are currently active. Reuse one as a best-effort fallback; the
+    // fast path has corruption detection and can fall back to slow squaring.
+    lease.slot = static_cast<int>(start % kSlots);
+    lease.owns_slot = false;
+    return lease.slot;
+#else
+    return 0;
+#endif
+}
 
 //always works
 void repeated_square_original(vdf_original &vdfo, form& f, const integer&, const integer&, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
@@ -185,6 +242,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
         #endif
 
         uint64 batch_size=c_checkpoint_interval;
+        if (weso != NULL) {
+            weso->OnBatchStart(num_iterations, batch_size);
+        }
 
         #ifdef ENABLE_TRACK_CYCLES
             print( "track cycles enabled; results will be wrong" );
@@ -195,7 +255,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
         // x86/x64: use the phased pipeline.
         square_state_type square_state;
-        square_state.pairindex = 0;
+        square_state.pairindex = vdf_fast_pairindex();
         actual_iterations = repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
 #else
         // Non-x86: use the C++ NUDUPL path (faster and lower maintenance than the phased pipeline).
@@ -215,6 +275,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
 
         if (actual_iterations==~uint64(0)) {
             //corruption; f is unchanged. do the entire batch with the slow algorithm
+            if (weso != NULL) {
+                weso->OnBatchReplay(num_iterations, batch_size);
+            }
             repeated_square_original(*weso->vdfo, f, D, L, num_iterations, batch_size, weso);
             actual_iterations=batch_size;
 
@@ -298,10 +361,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
             }
         #endif
     }
-    {
-        // this shouldn't be needed but avoids some false positive in TSAN
-        std::lock_guard<std::mutex> lk(cout_lock);
-        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
+    if (!quiet_mode) {
+        {
+            // this shouldn't be needed but avoids some false positive in TSAN
+            std::lock_guard<std::mutex> lk(cout_lock);
+            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
+        }
     }
 
     #ifdef VDF_TEST
@@ -337,8 +402,9 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
     proof_serialized = SerializeForm(proof_form, d_bits);
     Proof proof(y_serialized, proof_serialized);
     proof.witness_type = 0;
-    {
-        // this shouldn't be needed but avoids some false positive in TSAN
+    if (!quiet_mode) {
+        // Keep proof diagnostics available for vdf_client while quiet_mode
+        // suppresses output in embedded library-mode call paths.
         std::lock_guard<std::mutex> lk(cout_lock);
         std::cout << "Got simple weso proof: " << proof.hex() << "\n";
     }