From 7847c67b743c1f1f6c9cf218c57e8163f61816be Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Mon, 23 Feb 2026 23:30:52 -0800 Subject: [PATCH 01/13] Add streaming one-wesolowski compaction APIs. Introduce a fast C wrapper with streaming proof generation, incremental GetBlock optimization, and memory-budgeted (k,l) tuning, plus the minimal runtime/build infrastructure needed to embed chiavdf in multi-worker clients. Co-authored-by: Cursor --- docs/bluebox_compaction.md | 49 ++ src/Makefile.vdf-client | 39 +- src/c_bindings/fast_wrapper.cpp | 795 ++++++++++++++++++++++++++++++++ src/c_bindings/fast_wrapper.h | 145 ++++++ src/threading.h | 4 +- src/vdf.h | 29 +- 6 files changed, 1044 insertions(+), 17 deletions(-) create mode 100644 docs/bluebox_compaction.md create mode 100644 src/c_bindings/fast_wrapper.cpp create mode 100644 src/c_bindings/fast_wrapper.h diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md new file mode 100644 index 00000000..61cd1fd4 --- /dev/null +++ b/docs/bluebox_compaction.md @@ -0,0 +1,49 @@ +# Bluebox Compaction Optimizations + +This document describes the compaction-oriented proving path exposed by +`src/c_bindings/fast_wrapper.h` and implemented in +`src/c_bindings/fast_wrapper.cpp`. + +## Scope + +These APIs are intended for workloads where the expected VDF output (`y_ref`) is +already known up front (for example, bluebox compaction jobs). They are additive +and do not change the existing `c_wrapper` APIs. + +## Optimization 1: Streaming one-wesolowski + +Given `y_ref`, the prover computes: + +- `B = GetB(D, x, y_ref)` before squaring starts + +This enables a streaming algorithm that updates proof buckets at each +checkpoint during repeated squaring, instead of materializing the full +intermediate checkpoint array and scanning it after the loop. In practice this +substantially reduces memory usage for compaction workloads. 
+ +## Optimization 2: Incremental GetBlock mapping + +For streaming checkpoint updates, bucket index selection repeatedly calls +`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and +advances sequential `p` values incrementally, avoiding full modular +exponentiation per call and avoiding a large lookup table. + +## Optimization 3: Memory-budgeted (k, l) tuning + +The wrapper can tune `(k, l)` under a configured memory budget: + +- `chiavdf_set_bucket_memory_budget_bytes(...)` + +If the budget is 0, the job has fewer than 65536 iterations, or no candidate fits +the budget, the code falls back to the standard (untuned) parameter heuristics. + +## Operational Notes + +- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to + avoid unsolicited stdout noise when embedded in multi-worker clients. +- Thread-slot assignment for the fast VDF counters is per-thread via + `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations + run in one process. Slots are handed out round-robin over the 512-entry counter arrays and are not released on thread exit, so indices repeat once more than 512 threads have requested one. +- The production default for `enable_threads` in `parameters.h` is unchanged from + upstream to preserve timelord expectations. + diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client index 59fcbb63..0fe2380a 100644 --- a/src/Makefile.vdf-client +++ b/src/Makefile.vdf-client @@ -26,15 +26,26 @@ ifeq ($(UNAME),Darwin) NOPIE = endif -CFLAGS += $(LTO_FLAGS) $(NOPIE) -LDFLAGS += $(LTO_FLAGS) $(NOPIE) -g +# Optional: set `PIC=1` to build position-independent objects. 
+PIC ?= 0 +ifeq ($(PIC),1) +PICFLAGS = -fPIC +PIEFLAGS = +else +PICFLAGS = +PIEFLAGS = $(NOPIE) +endif + +CFLAGS += $(LTO_FLAGS) $(PIEFLAGS) $(PICFLAGS) +LDFLAGS += $(LTO_FLAGS) $(PIEFLAGS) -g ifeq ($(OS),Windows_NT) LDLIBS += -lmpirxx -lmpir -lws2_32 -CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(NOPIE) -fvisibility=hidden +CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden else LDLIBS += -lgmpxx -lgmp -pthread -CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden +CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden endif +ASFLAGS += $(PICFLAGS) ifeq ($(UNAME),Darwin) CXXFLAGS += -D CHIAOSX=1 # Homebrew (common on macOS) installs boost/gmp to /opt/homebrew or /usr/local @@ -81,7 +92,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench all: $(BINS) clean: - rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client + rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a $(BINS) avx512_test: %: %.o lzcnt.o $(ASM_OBJS) $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) @@ -91,6 +102,9 @@ $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS) lzcnt.o: refcode/lzcnt.c $(CC) $(CFLAGS) -c refcode/lzcnt.c +%.o: %.s + $(CC) -c $< -o $@ $(ASFLAGS) + asm_compiled.s: compile_asm ./compile_asm @@ -104,6 +118,21 @@ compile_asm: compile_asm.o $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base_hw.o vdf_hw_symbol_anchors.o prover_runtime.o lzcnt.o +# --------------------------------------------------------------------------- +# Static library: fast one-wesolowski proof (BBR integration) +# 
--------------------------------------------------------------------------- + +FASTLIB = libchiavdf_fastc.a +FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o $(ASM_OBJS) + +.PHONY: fastlib + +fastlib: $(FASTLIB) + +$(FASTLIB): $(FASTLIB_OBJS) + $(AR) rcs $@ $^ + +c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS) EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o ifeq ($(OS),Windows_NT) HW_LIB = hw/libft4222/libft4222.lib diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp new file mode 100644 index 00000000..198d0a87 --- /dev/null +++ b/src/c_bindings/fast_wrapper.cpp @@ -0,0 +1,795 @@ +#include "fast_wrapper.h" + +#include +#include +#include +#include +#include + +#include "../vdf.h" +#include "../create_discriminant.h" + +// Runtime configuration knobs required by `parameters.h`. +// These are `extern` variables there, but each binary defines them explicitly. +bool use_divide_table = false; +int gcd_base_bits = 50; +int gcd_128_max_iter = 3; +std::string asmprefix = "cel_"; +bool enable_all_instructions = false; + +namespace { +std::once_flag init_once; +std::atomic bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL); +std::atomic streaming_stats_enabled(false); + +struct LastStreamingParameters { + uint32_t k = 0; + uint32_t l = 0; + bool tuned = false; + bool set = false; +}; + +thread_local LastStreamingParameters last_streaming_parameters; + +struct LastStreamingStats { + uint64_t checkpoint_total_ns = 0; + uint64_t checkpoint_event_total_ns = 0; + uint64_t finalize_total_ns = 0; + uint64_t checkpoint_calls = 0; + uint64_t bucket_updates = 0; + bool set = false; +}; + +thread_local LastStreamingStats last_streaming_stats; + +void init_chiavdf_fast() { + init_gmp(); + set_rounding_mode(); + + // Match the vdf_client runtime selection for AVX2. + if (hasAVX2()) { + gcd_base_bits = 63; + gcd_128_max_iter = 2; + } else { + gcd_base_bits = 50; + gcd_128_max_iter = 3; + } + + // Ensure we run the one-wesolowski path by default. 
+ fast_algorithm = false; + two_weso = false; + quiet_mode = true; +} + +ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; } + +uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) { + // Be conservative: class group forms contain 3 GMP-backed integers that + // quickly grow to the discriminant size (or beyond) during NUCOMP. + // + // This estimate is intentionally larger than the raw serialized size to + // avoid picking parameters that risk paging/OOM. + uint64_t discr_bytes = (static_cast(discriminant_size_bits) + 7) / 8; + uint64_t estimate = discr_bytes * 16; + if (estimate < 2048) { + estimate = 2048; + } + return estimate; +} + +bool tune_streaming_parameters( + uint64_t num_iterations, + size_t discriminant_size_bits, + uint64_t memory_budget_bytes, + uint32_t& out_l, + uint32_t& out_k) { + if (memory_budget_bytes == 0) { + return false; + } + + // Keep headroom for GMP scratch allocations and general process overhead. + uint64_t budget = (memory_budget_bytes * 80) / 100; + uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits); + if (budget < bytes_per_form) { + return false; + } + + unsigned __int128 best_cost = std::numeric_limits::max(); + bool found = false; + + // Empirical tuning notes (1024-bit discriminants, AVX2 build): + // - Each bucket update (NUCOMP) and each fold unit is ~5µs. + // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs. + // + // So checkpoint counts should be weighted much lower than updates/fold. + constexpr unsigned __int128 update_weight = 16; + constexpr unsigned __int128 fold_weight = 16; + constexpr unsigned __int128 checkpoint_weight = 1; + + // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work + // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k). 
+ for (uint32_t k = 4; k <= 20; k++) { + unsigned __int128 buckets_per_row = static_cast(1) << k; + + for (uint32_t l = 1; l <= 64; l++) { + unsigned __int128 form_count = buckets_per_row * static_cast(l); + unsigned __int128 mem_required = + form_count * static_cast(bytes_per_form); + if (mem_required > static_cast(budget)) { + continue; + } + + unsigned __int128 updates = static_cast( + (num_iterations + static_cast(k) - 1) / static_cast(k)); + uint64_t kl = static_cast(k) * static_cast(l); + unsigned __int128 checkpoints = static_cast( + (num_iterations + kl - 1) / kl); + unsigned __int128 fold = static_cast(l) << (k + 1); + unsigned __int128 cost = + updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight; + + if (!found || cost < best_cost) { + found = true; + best_cost = cost; + out_k = k; + out_l = l; + } + } + } + + return found; +} + +uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) { + integer res = FastPow(2, T - k * (i + 1), B); + mpz_mul_2exp(res.impl, res.impl, k); + res = res / B; + auto res_vector = res.to_vector(); + return res_vector.empty() ? 
0 : res_vector[0]; +} + +class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback { + public: + ProgressOneWesolowskiCallback( + integer& D, + form& f, + uint64_t wanted_iter, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) + : OneWesolowskiCallback(D, f, wanted_iter), + progress_interval(progress_interval), + progress_cb(progress_cb), + progress_user_data(progress_user_data), + next_progress(progress_interval) {} + + void OnIteration(int type, void* data, uint64_t iteration) override { + OneWesolowskiCallback::OnIteration(type, data, iteration); + + if (progress_cb == nullptr || progress_interval == 0) { + return; + } + + uint64_t done = iteration + 1; + if (done > wanted_iter) { + return; + } + + if (done >= next_progress) { + progress_cb(next_progress, progress_user_data); + next_progress += progress_interval; + } + } + + private: + uint64_t progress_interval; + ChiavdfProgressCallback progress_cb; + void* progress_user_data; + uint64_t next_progress; +}; + +class StreamingOneWesolowskiCallback final : public WesolowskiCallback { + public: + StreamingOneWesolowskiCallback( + integer& D, + uint64_t wanted_iter, + uint32_t k, + uint32_t l, + uint64_t limit, + integer& B, + bool use_getblock_opt, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) + : WesolowskiCallback(D), + wanted_iter(wanted_iter), + k(k), + l(l), + kl(static_cast(k) * static_cast(l)), + limit(limit), + B(B), + progress_interval(progress_interval), + progress_cb(progress_cb), + progress_user_data(progress_user_data), + next_progress(progress_interval), + use_getblock_opt(use_getblock_opt), + stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) { + form id = form::identity(D); + buckets.resize(static_cast(l) * (1ULL << k), id); + + if (use_getblock_opt) { + getblock_ok = init_getblock_opt_state(); + } + } + + void OnIteration(int type, void* data, uint64_t iteration) 
override { + iteration++; + if (iteration > wanted_iter) { + return; + } + + if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) { + progress_cb(next_progress, progress_user_data); + next_progress += progress_interval; + } + + if (iteration % kl == 0) { + uint64_t pos = iteration / kl; + if (pos < limit) { + form checkpoint; + auto started_at = std::chrono::steady_clock::time_point{}; + if (stats_enabled) { + started_at = std::chrono::steady_clock::now(); + } + SetForm(type, data, &checkpoint); + process_checkpoint(pos, checkpoint, /*record_stats=*/true); + if (stats_enabled) { + checkpoint_event_total_ns += static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - started_at) + .count()); + } + } + } + + if (iteration == wanted_iter) { + SetForm(type, data, &result); + has_result = true; + } + } + + void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) { + const bool do_stats = stats_enabled && record_stats; + auto started_at = std::chrono::steady_clock::time_point{}; + if (do_stats) { + started_at = std::chrono::steady_clock::now(); + } + + uint64_t local_updates = 0; + for (uint32_t j = 0; j < l; j++) { + uint64_t p = i * static_cast(l) + static_cast(j); + uint64_t needed = static_cast(k) * (p + 1); + if (wanted_iter < needed) { + break; + } + uint64_t b = use_getblock_opt ? 
get_block_opt(p) : get_block(p, k, wanted_iter, B); + if (do_stats) { + local_updates++; + } + nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L); + } + + if (do_stats) { + checkpoint_calls++; + bucket_updates += local_updates; + checkpoint_total_ns += static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - started_at) + .count()); + } + } + + bool init_ok() const { return getblock_ok; } + + bool ok() const { return has_result; } + + const form& y() const { return result; } + + form finalize_proof() { + auto started_at = std::chrono::steady_clock::time_point{}; + if (stats_enabled) { + started_at = std::chrono::steady_clock::now(); + } + + PulmarkReducer reducer; + form id = form::identity(D); + + uint64_t k1 = k / 2; + uint64_t k0 = k - k1; + form x = id; + + for (int64_t j = static_cast(l) - 1; j >= 0; j--) { + x = FastPowFormNucomp(x, D, integer(static_cast(1) << k), L, reducer); + + for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { + form z = id; + for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { + nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); + } + z = FastPowFormNucomp( + z, + D, + integer(static_cast(b1 * (1ULL << k0))), + L, + reducer); + nucomp_form(x, x, z, D, L); + } + + for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { + form z = id; + for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { + nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); + } + z = FastPowFormNucomp(z, D, integer(b0), L, reducer); + nucomp_form(x, x, z, D, L); + } + } + + reducer.reduce(x); + + if (stats_enabled) { + finalize_total_ns += static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - started_at) + .count()); + } + return x; + } + + bool stats_ok() const { return stats_enabled; } + + LastStreamingStats stats() const { + LastStreamingStats out; + out.checkpoint_total_ns = checkpoint_total_ns; + out.checkpoint_event_total_ns = checkpoint_event_total_ns; + out.finalize_total_ns = 
finalize_total_ns; + out.checkpoint_calls = checkpoint_calls; + out.bucket_updates = bucket_updates; + out.set = stats_enabled; + return out; + } + + private: + form& bucket(uint32_t j, uint64_t b) { + size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); + return buckets[idx]; + } + + const form& bucket(uint32_t j, uint64_t b) const { + size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); + return buckets[idx]; + } + + uint64_t wanted_iter; + uint32_t k; + uint32_t l; + uint64_t kl; + uint64_t limit; + integer B; + uint64_t progress_interval; + ChiavdfProgressCallback progress_cb; + void* progress_user_data; + uint64_t next_progress; + + std::vector
buckets; + form result; + bool has_result = false; + + bool use_getblock_opt; + bool getblock_ok = true; + uint64_t getblock_next_p = 0; + integer getblock_inv_2k; + integer getblock_r; + integer getblock_tmp; + + bool stats_enabled; + uint64_t checkpoint_total_ns = 0; + uint64_t checkpoint_event_total_ns = 0; + uint64_t finalize_total_ns = 0; + uint64_t checkpoint_calls = 0; + uint64_t bucket_updates = 0; + + bool init_getblock_opt_state() { + if (k == 0) { + return false; + } + uint64_t k_u64 = static_cast(k); + if (wanted_iter < k_u64) { + return true; + } + + integer two_k_mod = FastPow(2, k_u64, B); + if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) { + return false; + } + + getblock_r = FastPow(2, wanted_iter - k_u64, B); + getblock_next_p = 0; + return true; + } + + uint64_t get_block_opt(uint64_t p) { + if (!getblock_ok || wanted_iter < static_cast(k)) { + return get_block(p, k, wanted_iter, B); + } + + // Expected call pattern is sequential `p`. If we ever get out of sync, + // advance state forward or fall back to the slow mapping. 
+ if (p < getblock_next_p) { + return get_block(p, k, wanted_iter, B); + } + while (getblock_next_p < p) { + mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); + mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); + getblock_next_p++; + } + + mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k); + mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl); + uint64_t b = mpz_get_ui(getblock_tmp.impl); + + mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); + mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); + getblock_next_p++; + + return b; + } +}; + +ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data, + bool use_getblock_opt) { + std::call_once(init_once, init_chiavdf_fast); + + last_streaming_stats = LastStreamingStats{}; + + if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 || + y_ref_s == nullptr || y_ref_s_size == 0) { + return empty_result(); + } + if (num_iterations == 0) { + return empty_result(); + } + + std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); + integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); + integer L = root(-D, 4); + + form x = DeserializeForm(D, x_s, x_s_size); + form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size); + + uint32_t k; + uint32_t l; + bool tuned = false; + const uint64_t budget = + bucket_memory_budget_bytes.load(std::memory_order_relaxed); + if (num_iterations >= (1 << 16)) { + tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k); + } + if (!tuned) { + if (num_iterations >= (1 << 16)) { + ApproximateParameters(num_iterations, l, k); + } else { + k 
= 10; + l = 1; + } + } + if (k == 0) { + k = 1; + } + if (l == 0) { + l = 1; + } + + last_streaming_parameters.k = k; + last_streaming_parameters.l = l; + last_streaming_parameters.tuned = tuned; + last_streaming_parameters.set = true; + + uint64_t kl = static_cast(k) * static_cast(l); + uint64_t limit = num_iterations / kl; + if (num_iterations % kl) { + limit++; + } + + integer B = GetB(D, x, y_ref); + + std::atomic stopped(false); + StreamingOneWesolowskiCallback weso( + D, + num_iterations, + k, + l, + limit, + B, + use_getblock_opt, + progress_interval, + progress_cb, + progress_user_data); + + if (!weso.init_ok()) { + return empty_result(); + } + + weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false); + + FastStorage* fast_storage = nullptr; + repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); + + if (!weso.ok()) { + return empty_result(); + } + if (!(weso.y() == y_ref)) { + return empty_result(); + } + + form proof_form = weso.finalize_proof(); + + if (weso.stats_ok()) { + last_streaming_stats = weso.stats(); + } + + int d_bits = D.num_bits(); + std::vector y_serialized = SerializeForm(y_ref, d_bits); + std::vector proof_serialized = SerializeForm(proof_form, d_bits); + + if (y_serialized.empty() || proof_serialized.empty()) { + return empty_result(); + } + + const size_t total = y_serialized.size() + proof_serialized.size(); + uint8_t* out = new uint8_t[total]; + std::copy(y_serialized.begin(), y_serialized.end(), out); + std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size()); + return ChiavdfByteArray{out, total}; +} +} // namespace + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations) { + return chiavdf_prove_one_weso_fast_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + discriminant_size_bits, + num_iterations, 
+ /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + try { + std::call_once(init_once, init_chiavdf_fast); + + if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) { + return empty_result(); + } + if (num_iterations == 0) { + return empty_result(); + } + + std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); + integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); + integer L = root(-D, 4); + + form x = DeserializeForm(D, x_s, x_s_size); + + std::atomic stopped(false); + ProgressOneWesolowskiCallback weso( + D, + x, + num_iterations, + progress_interval, + progress_cb, + progress_user_data); + + // Run the fast repeated-squaring engine to `num_iterations`. + // The callback stores all intermediates needed for the proof. + FastStorage* fast_storage = nullptr; + repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); + + // Now generate the compact proof from the stored intermediates. + Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped); + if (proof.y.empty() || proof.proof.empty()) { + return empty_result(); + } + + const size_t total = proof.y.size() + proof.proof.size(); + uint8_t* out = new uint8_t[total]; + std::copy(proof.y.begin(), proof.y.end(), out); + std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size()); + return ChiavdfByteArray{out, total}; + } catch (...) 
{ + return empty_result(); + } +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations) { + return chiavdf_prove_one_weso_fast_streaming_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + try { + return chiavdf_prove_one_weso_fast_streaming_impl( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + progress_interval, + progress_cb, + progress_user_data, + /*use_getblock_opt=*/false); + } catch (...) 
{ + return empty_result(); + } +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations) { + return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + try { + return chiavdf_prove_one_weso_fast_streaming_impl( + challenge_hash, + challenge_size, + x_s, + x_s_size, + y_ref_s, + y_ref_s_size, + discriminant_size_bits, + num_iterations, + progress_interval, + progress_cb, + progress_user_data, + /*use_getblock_opt=*/true); + } catch (...) 
{ + return empty_result(); + } +} + +extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) { + bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed); +} + +extern "C" void chiavdf_set_enable_streaming_stats(bool enable) { + streaming_stats_enabled.store(enable, std::memory_order_relaxed); + last_streaming_stats = LastStreamingStats{}; +} + +extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) { + if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) { + return false; + } + if (!last_streaming_parameters.set) { + return false; + } + *out_k = last_streaming_parameters.k; + *out_l = last_streaming_parameters.l; + *out_tuned = last_streaming_parameters.tuned; + return true; +} + +extern "C" bool chiavdf_get_last_streaming_stats( + uint64_t* out_checkpoint_total_ns, + uint64_t* out_checkpoint_event_total_ns, + uint64_t* out_finalize_total_ns, + uint64_t* out_checkpoint_calls, + uint64_t* out_bucket_updates) { + if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr || + out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr || + out_bucket_updates == nullptr) { + return false; + } + if (!last_streaming_stats.set) { + return false; + } + *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns; + *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns; + *out_finalize_total_ns = last_streaming_stats.finalize_total_ns; + *out_checkpoint_calls = last_streaming_stats.checkpoint_calls; + *out_bucket_updates = last_streaming_stats.bucket_updates; + return true; +} + +extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; } diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h new file mode 100644 index 00000000..bf33f320 --- /dev/null +++ b/src/c_bindings/fast_wrapper.h @@ -0,0 +1,145 @@ +#pragma once + +#include +#include +#include + +#ifdef __cplusplus 
+extern "C" { +#endif + +typedef struct { + uint8_t* data; + size_t length; +} ChiavdfByteArray; + +typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data); + +// Configure the memory budget (default: 128 MiB) used by the parameter tuner +// when selecting `(k,l)` for streaming/bucket-based proving. +// +// The setting is process-wide (not shared across processes) and is applied to +// each streaming proof separately, so concurrent proofs can jointly exceed it. +// If `bytes` is 0, tuning is disabled and the default chiavdf heuristic is used. +void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes); + +// Debug helper: returns the `(k,l)` parameters selected for the most recent +// streaming proof attempted on the current thread (recorded before proving starts). +// +// Returns true if parameters are available and all output pointers are non-NULL. +bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned); + +// Enable lightweight timing counters for the streaming prover. The flag is +// process-wide; calling this also clears the calling thread's last counters. +// When enabled, the native library records basic timing counters for the most +// recent streaming proof computed on the current thread. This is intended for +// benchmarking and tuning; production runs should keep this disabled to avoid +// extra overhead. +void chiavdf_set_enable_streaming_stats(bool enable); + +// Debug helper: returns timing counters for the most recent streaming proof on +// the current thread. +// +// Returns true if all output pointers are non-NULL and stats are available (i.e. +// stats were enabled and a streaming proof was computed successfully). +bool chiavdf_get_last_streaming_stats( + uint64_t* out_checkpoint_total_ns, + uint64_t* out_checkpoint_event_total_ns, + uint64_t* out_finalize_total_ns, + uint64_t* out_checkpoint_calls, + uint64_t* out_bucket_updates); + +// Computes a compact (witness_type=0) Wesolowski proof using the fast engine. +// +// On success, returns `y || proof` where: +// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants) +// - `proof` is the serialized witness form (same size as `y`) +// +// On failure, returns `{NULL, 0}`. 
+ChiavdfByteArray chiavdf_prove_one_weso_fast( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations); + +// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from +// the proving thread every `progress_interval` iterations completed. +// +// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported. +ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data); + +// Computes a compact (witness_type=0) Wesolowski proof using the "streaming" +// bucket-accumulation algorithm (Optimization 1 in docs/bluebox_compaction.md), +// which requires the expected output `y_ref` up front (as used by bluebox compaction jobs). +// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`). +// On failure, or if the computed output differs from `y_ref`, returns `{NULL, 0}`. +ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations); + +// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes +// `progress_cb` from the proving thread every `progress_interval` iterations. 
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data); + +// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized +// implementation of the `GetBlock()` mapping (avoids per-block modular +// exponentiation without allocating a full `GetBlock` table). +ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations); + +// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally +// invokes `progress_cb` from the proving thread every `progress_interval` +// iterations. 
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + const uint8_t* y_ref_s, + size_t y_ref_s_size, + size_t discriminant_size_bits, + uint64_t num_iterations, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data); + +void chiavdf_free_byte_array(ChiavdfByteArray array); + +#ifdef __cplusplus +} +#endif diff --git a/src/threading.h b/src/threading.h index 3244b3c3..8354d824 100644 --- a/src/threading.h +++ b/src/threading.h @@ -566,8 +566,8 @@ struct alignas(64) thread_counter { } }; -thread_counter master_counter[100]; -thread_counter slave_counter[100]; +thread_counter master_counter[512]; +thread_counter slave_counter[512]; struct thread_state { int pairindex; diff --git a/src/vdf.h b/src/vdf.h index 7bb911f9..f24c09c6 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -87,6 +87,18 @@ std::mutex new_event_mutex, cout_lock; bool debug_mode = false; bool fast_algorithm = false; bool two_weso = false; +bool quiet_mode = false; + +// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`. +// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`. +// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can +// run concurrently in the same process; they must not share a pairindex. 
+inline int vdf_fast_pairindex() { + constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0])); + static std::atomic next_slot{0}; + thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots; + return slot; +} //always works void repeated_square_original(vdf_original &vdfo, form& f, const integer&, const integer&, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) { @@ -195,7 +207,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) // x86/x64: use the phased pipeline. square_state_type square_state; - square_state.pairindex = 0; + square_state.pairindex = vdf_fast_pairindex(); actual_iterations = repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso); #else // Non-x86: use the C++ NUDUPL path (faster and lower maintenance than the phased pipeline). @@ -298,10 +310,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege } #endif } - { - // this shouldn't be needed but avoids some false positive in TSAN - std::lock_guard lk(cout_lock); - std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush; + if (!quiet_mode) { + { + // this shouldn't be needed but avoids some false positive in TSAN + std::lock_guard lk(cout_lock); + std::cout << "VDF loop finished. 
Total iters: " << num_iterations << "\n" << std::flush; + } } #ifdef VDF_TEST @@ -337,11 +351,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba proof_serialized = SerializeForm(proof_form, d_bits); Proof proof(y_serialized, proof_serialized); proof.witness_type = 0; - { - // this shouldn't be needed but avoids some false positive in TSAN - std::lock_guard lk(cout_lock); - std::cout << "Got simple weso proof: " << proof.hex() << "\n"; - } return proof; } From 7be07522d02eca9fc65ca9b34b96eb2057659e76 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Mon, 23 Feb 2026 23:41:52 -0800 Subject: [PATCH 02/13] Fix non-x86 build break in vdf_fast_pairindex. Guard the fast pairindex slot selection behind the existing x86/asm feature checks and return slot 0 on non-x86 targets, where threading counters are not compiled. Co-authored-by: Cursor --- src/vdf.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/vdf.h b/src/vdf.h index f24c09c6..c2f8834f 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -94,10 +94,14 @@ bool quiet_mode = false; // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can // run concurrently in the same process; they must not share a pairindex. inline int vdf_fast_pairindex() { +#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0])); static std::atomic next_slot{0}; thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots; return slot; +#else + return 0; +#endif } //always works From 3755be28167c172b1bb6115b081d1d23903a6d98 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Mon, 23 Feb 2026 23:57:04 -0800 Subject: [PATCH 03/13] Ensure cmake is present on macOS CI runners. Install cmake via Homebrew and export its bin path in the C libraries and wheel workflows so self-hosted macOS jobs don't fail when cmake is missing from PATH. 
Co-authored-by: Cursor --- .github/workflows/build-c-libraries.yml | 11 +++++++++++ .github/workflows/build.yml | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml index 00ca38c9..db833104 100644 --- a/.github/workflows/build-c-libraries.yml +++ b/.github/workflows/build-c-libraries.yml @@ -82,6 +82,17 @@ jobs: fetch-depth: 1 path: mpir_gc_x64 + - name: Ensure cmake available (macOS) + if: matrix.os.matrix == 'macos' + shell: bash + run: | + brew ls --versions cmake >/dev/null 2>&1 || brew install cmake + CMAKE_BIN="$(brew --prefix cmake)/bin" + if [ -d "$CMAKE_BIN" ]; then + echo "$CMAKE_BIN" >> "$GITHUB_PATH" + fi + cmake --version + - name: Build working-directory: src env: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cd6bec02..4ad967ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -102,6 +102,17 @@ jobs: with: python-version: ${{ matrix.python.major-dot-minor }} + - name: Ensure cmake available (macOS) + if: matrix.os.matrix == 'macos' + shell: bash + run: | + brew ls --versions cmake >/dev/null 2>&1 || brew install cmake + CMAKE_BIN="$(brew --prefix cmake)/bin" + if [ -d "$CMAKE_BIN" ]; then + echo "$CMAKE_BIN" >> "$GITHUB_PATH" + fi + cmake --version + - name: Install pipx run: | pip install pipx From 073427ac5cc60adea976104e1aff47bd9fb2ccc4 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 24 Feb 2026 00:00:51 -0800 Subject: [PATCH 04/13] Improve fast-path batch replay handling and harden pairindex slot allocation. Track and roll back per-batch checkpoints when replaying a failed fast batch, and switch pairindex slot allocation to unsigned atomics to avoid negative modulo indexing after counter wraparound. 
Co-authored-by: Cursor --- src/c_bindings/fast_wrapper.cpp | 44 ++++++++++++++++++++++++++++++++- src/callback.h | 8 ++++++ src/vdf.h | 12 ++++++--- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp index 198d0a87..5f01e905 100644 --- a/src/c_bindings/fast_wrapper.cpp +++ b/src/c_bindings/fast_wrapper.cpp @@ -243,6 +243,9 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { } SetForm(type, data, &checkpoint); process_checkpoint(pos, checkpoint, /*record_stats=*/true); + if (iteration >= batch_start_iteration && iteration <= batch_end_iteration) { + current_batch_checkpoints.push_back(BatchCheckpoint{pos, checkpoint}); + } if (stats_enabled) { checkpoint_event_total_ns += static_cast( std::chrono::duration_cast( @@ -258,7 +261,44 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { } } + void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) override { + current_batch_checkpoints.clear(); + if (batch_size == 0) { + batch_start_iteration = 1; + batch_end_iteration = 0; + return; + } + batch_start_iteration = base_iteration + 1; + if (std::numeric_limits::max() - base_iteration < batch_size) { + batch_end_iteration = std::numeric_limits::max(); + } else { + batch_end_iteration = base_iteration + batch_size; + } + } + + void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override { + for (const BatchCheckpoint& entry : current_batch_checkpoints) { + rollback_checkpoint(entry.index, entry.checkpoint); + } + OnBatchStart(base_iteration, batch_size); + } + void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) { + apply_checkpoint(i, checkpoint, record_stats); + } + + private: + struct BatchCheckpoint { + uint64_t index; + form checkpoint; + }; + + void rollback_checkpoint(uint64_t i, const form& checkpoint) { + form inverse_checkpoint = checkpoint.inverse(); + apply_checkpoint(i, inverse_checkpoint, 
/*record_stats=*/false); + } + + void apply_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) { const bool do_stats = stats_enabled && record_stats; auto started_at = std::chrono::steady_clock::time_point{}; if (do_stats) { @@ -359,7 +399,6 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { return out; } - private: form& bucket(uint32_t j, uint64_t b) { size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); return buckets[idx]; @@ -391,6 +430,9 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { integer getblock_inv_2k; integer getblock_r; integer getblock_tmp; + uint64_t batch_start_iteration = 1; + uint64_t batch_end_iteration = 0; + std::vector current_batch_checkpoints; bool stats_enabled; uint64_t checkpoint_total_ns = 0; diff --git a/src/callback.h b/src/callback.h index f4764bbf..9ebf3543 100644 --- a/src/callback.h +++ b/src/callback.h @@ -73,6 +73,14 @@ class WesolowskiCallback :public INUDUPLListener { } virtual void OnIteration(int type, void *data, uint64_t iteration) = 0; + virtual void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) { + (void)base_iteration; + (void)batch_size; + } + virtual void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) { + (void)base_iteration; + (void)batch_size; + } std::unique_ptr forms; size_t forms_capacity = 0; diff --git a/src/vdf.h b/src/vdf.h index c2f8834f..8ca75d8c 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -95,9 +95,9 @@ bool quiet_mode = false; // run concurrently in the same process; they must not share a pairindex. 
inline int vdf_fast_pairindex() { #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) - constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0])); - static std::atomic next_slot{0}; - thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots; + constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0])); + static std::atomic next_slot{0}; + thread_local int slot = int(next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots); return slot; #else return 0; @@ -201,6 +201,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege #endif uint64 batch_size=c_checkpoint_interval; + if (weso != NULL) { + weso->OnBatchStart(num_iterations, batch_size); + } #ifdef ENABLE_TRACK_CYCLES print( "track cycles enabled; results will be wrong" ); @@ -231,6 +234,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege if (actual_iterations==~uint64(0)) { //corruption; f is unchanged. do the entire batch with the slow algorithm + if (weso != NULL) { + weso->OnBatchReplay(num_iterations, batch_size); + } repeated_square_original(*weso->vdfo, f, D, L, num_iterations, batch_size, weso); actual_iterations=batch_size; From fd000ab88ded1cc3386acd892401cb626ae3cc14 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 24 Feb 2026 00:33:22 -0800 Subject: [PATCH 05/13] Clarify batch iteration indexing in streaming callback. Document that batch bounds use completed-iteration base values while OnIteration is normalized to 1-based indices to avoid ambiguity in replay tracking. 
Co-authored-by: Cursor --- src/c_bindings/fast_wrapper.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp index 5f01e905..61d24599 100644 --- a/src/c_bindings/fast_wrapper.cpp +++ b/src/c_bindings/fast_wrapper.cpp @@ -268,6 +268,8 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { batch_end_iteration = 0; return; } + // `base_iteration` is the number of completed iterations before this batch. + // `OnIteration` normalizes to 1-based (`iteration++`), so this batch is [base+1, base+size]. batch_start_iteration = base_iteration + 1; if (std::numeric_limits::max() - base_iteration < batch_size) { batch_end_iteration = std::numeric_limits::max(); From 3f82dc2786903e0029d92fc36d5f95f6a04d94bc Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 24 Feb 2026 00:48:29 -0800 Subject: [PATCH 06/13] Add streaming tuner diagnostics and batch fast-wrapper APIs. Expose missing batch C bindings and debug visibility so downstream Rust tests can validate tuner behavior end-to-end. Co-authored-by: Cursor --- pr1_upstream_ready.patch | 1158 +++++++++++++++++++++++++++++++ src/c_bindings/fast_wrapper.cpp | 168 +++++ src/c_bindings/fast_wrapper.h | 32 + 3 files changed, 1358 insertions(+) create mode 100644 pr1_upstream_ready.patch diff --git a/pr1_upstream_ready.patch b/pr1_upstream_ready.patch new file mode 100644 index 00000000..b14a93bb --- /dev/null +++ b/pr1_upstream_ready.patch @@ -0,0 +1,1158 @@ +diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client +index ed41963..ca55a95 100644 +--- a/src/Makefile.vdf-client ++++ b/src/Makefile.vdf-client +@@ -6,9 +6,24 @@ else + NOPIE = -no-pie + endif + +-LDFLAGS += -flto $(NOPIE) -g ++# Optional: override `LTO=` to disable link-time optimization. ++LTO ?= -flto ++ ++# Optional: set `PIC=1` to build position-independent objects (recommended when ++# linking chiavdf code into other PIE/shared-library binaries). 
++PIC ?= 0 ++ifeq ($(PIC),1) ++PICFLAGS = -fPIC ++PIEFLAGS = ++else ++PICFLAGS = ++PIEFLAGS = $(NOPIE) ++endif ++ ++LDFLAGS += $(LTO) $(PIEFLAGS) -g + LDLIBS += -lgmpxx -lgmp -pthread +-CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden ++CXXFLAGS += $(LTO) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden ++ASFLAGS += $(PICFLAGS) + ifeq ($(UNAME),Darwin) + CXXFLAGS += -D CHIAOSX=1 + endif +@@ -31,7 +46,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench + all: $(BINS) + + clean: +- rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client ++ rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a + + $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o + $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) +@@ -39,7 +54,10 @@ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_as + $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS) + + lzcnt.o: refcode/lzcnt.c +- $(CC) -c refcode/lzcnt.c ++ $(CC) -c refcode/lzcnt.c $(OPT_CFLAGS) $(PICFLAGS) ++ ++%.o: %.s ++ $(CC) -c $< -o $@ $(ASFLAGS) + + asm_compiled.s: compile_asm + ./compile_asm +@@ -53,6 +71,22 @@ avx512_asm_compiled.s: compile_asm + compile_asm: compile_asm.o + $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) + ++# --------------------------------------------------------------------------- ++# Static library: fast one-wesolowski proof (BBR integration) ++# --------------------------------------------------------------------------- ++ ++FASTLIB = libchiavdf_fastc.a ++FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o ++ ++.PHONY: fastlib ++ ++fastlib: $(FASTLIB) ++ ++$(FASTLIB): $(FASTLIB_OBJS) ++ $(AR) rcs $@ $^ ++ ++c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS) ++ + HW_OBJS = $(addprefix 
hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base.o lzcnt.o + EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o + HW_LIB = hw/libft4222/build-x86_64/libft4222.so +diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp +new file mode 100644 +index 0000000..198d0a8 +--- /dev/null ++++ b/src/c_bindings/fast_wrapper.cpp +@@ -0,0 +1,795 @@ ++#include "fast_wrapper.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "../vdf.h" ++#include "../create_discriminant.h" ++ ++// Runtime configuration knobs required by `parameters.h`. ++// These are `extern` variables there, but each binary defines them explicitly. ++bool use_divide_table = false; ++int gcd_base_bits = 50; ++int gcd_128_max_iter = 3; ++std::string asmprefix = "cel_"; ++bool enable_all_instructions = false; ++ ++namespace { ++std::once_flag init_once; ++std::atomic bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL); ++std::atomic streaming_stats_enabled(false); ++ ++struct LastStreamingParameters { ++ uint32_t k = 0; ++ uint32_t l = 0; ++ bool tuned = false; ++ bool set = false; ++}; ++ ++thread_local LastStreamingParameters last_streaming_parameters; ++ ++struct LastStreamingStats { ++ uint64_t checkpoint_total_ns = 0; ++ uint64_t checkpoint_event_total_ns = 0; ++ uint64_t finalize_total_ns = 0; ++ uint64_t checkpoint_calls = 0; ++ uint64_t bucket_updates = 0; ++ bool set = false; ++}; ++ ++thread_local LastStreamingStats last_streaming_stats; ++ ++void init_chiavdf_fast() { ++ init_gmp(); ++ set_rounding_mode(); ++ ++ // Match the vdf_client runtime selection for AVX2. ++ if (hasAVX2()) { ++ gcd_base_bits = 63; ++ gcd_128_max_iter = 2; ++ } else { ++ gcd_base_bits = 50; ++ gcd_128_max_iter = 3; ++ } ++ ++ // Ensure we run the one-wesolowski path by default. 
++ fast_algorithm = false; ++ two_weso = false; ++ quiet_mode = true; ++} ++ ++ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; } ++ ++uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) { ++ // Be conservative: class group forms contain 3 GMP-backed integers that ++ // quickly grow to the discriminant size (or beyond) during NUCOMP. ++ // ++ // This estimate is intentionally larger than the raw serialized size to ++ // avoid picking parameters that risk paging/OOM. ++ uint64_t discr_bytes = (static_cast(discriminant_size_bits) + 7) / 8; ++ uint64_t estimate = discr_bytes * 16; ++ if (estimate < 2048) { ++ estimate = 2048; ++ } ++ return estimate; ++} ++ ++bool tune_streaming_parameters( ++ uint64_t num_iterations, ++ size_t discriminant_size_bits, ++ uint64_t memory_budget_bytes, ++ uint32_t& out_l, ++ uint32_t& out_k) { ++ if (memory_budget_bytes == 0) { ++ return false; ++ } ++ ++ // Keep headroom for GMP scratch allocations and general process overhead. ++ uint64_t budget = (memory_budget_bytes * 80) / 100; ++ uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits); ++ if (budget < bytes_per_form) { ++ return false; ++ } ++ ++ unsigned __int128 best_cost = std::numeric_limits::max(); ++ bool found = false; ++ ++ // Empirical tuning notes (1024-bit discriminants, AVX2 build): ++ // - Each bucket update (NUCOMP) and each fold unit is ~5µs. ++ // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs. ++ // ++ // So checkpoint counts should be weighted much lower than updates/fold. ++ constexpr unsigned __int128 update_weight = 16; ++ constexpr unsigned __int128 fold_weight = 16; ++ constexpr unsigned __int128 checkpoint_weight = 1; ++ ++ // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work ++ // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k). 
++ for (uint32_t k = 4; k <= 20; k++) { ++ unsigned __int128 buckets_per_row = static_cast(1) << k; ++ ++ for (uint32_t l = 1; l <= 64; l++) { ++ unsigned __int128 form_count = buckets_per_row * static_cast(l); ++ unsigned __int128 mem_required = ++ form_count * static_cast(bytes_per_form); ++ if (mem_required > static_cast(budget)) { ++ continue; ++ } ++ ++ unsigned __int128 updates = static_cast( ++ (num_iterations + static_cast(k) - 1) / static_cast(k)); ++ uint64_t kl = static_cast(k) * static_cast(l); ++ unsigned __int128 checkpoints = static_cast( ++ (num_iterations + kl - 1) / kl); ++ unsigned __int128 fold = static_cast(l) << (k + 1); ++ unsigned __int128 cost = ++ updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight; ++ ++ if (!found || cost < best_cost) { ++ found = true; ++ best_cost = cost; ++ out_k = k; ++ out_l = l; ++ } ++ } ++ } ++ ++ return found; ++} ++ ++uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) { ++ integer res = FastPow(2, T - k * (i + 1), B); ++ mpz_mul_2exp(res.impl, res.impl, k); ++ res = res / B; ++ auto res_vector = res.to_vector(); ++ return res_vector.empty() ? 
0 : res_vector[0]; ++} ++ ++class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback { ++ public: ++ ProgressOneWesolowskiCallback( ++ integer& D, ++ form& f, ++ uint64_t wanted_iter, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data) ++ : OneWesolowskiCallback(D, f, wanted_iter), ++ progress_interval(progress_interval), ++ progress_cb(progress_cb), ++ progress_user_data(progress_user_data), ++ next_progress(progress_interval) {} ++ ++ void OnIteration(int type, void* data, uint64_t iteration) override { ++ OneWesolowskiCallback::OnIteration(type, data, iteration); ++ ++ if (progress_cb == nullptr || progress_interval == 0) { ++ return; ++ } ++ ++ uint64_t done = iteration + 1; ++ if (done > wanted_iter) { ++ return; ++ } ++ ++ if (done >= next_progress) { ++ progress_cb(next_progress, progress_user_data); ++ next_progress += progress_interval; ++ } ++ } ++ ++ private: ++ uint64_t progress_interval; ++ ChiavdfProgressCallback progress_cb; ++ void* progress_user_data; ++ uint64_t next_progress; ++}; ++ ++class StreamingOneWesolowskiCallback final : public WesolowskiCallback { ++ public: ++ StreamingOneWesolowskiCallback( ++ integer& D, ++ uint64_t wanted_iter, ++ uint32_t k, ++ uint32_t l, ++ uint64_t limit, ++ integer& B, ++ bool use_getblock_opt, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data) ++ : WesolowskiCallback(D), ++ wanted_iter(wanted_iter), ++ k(k), ++ l(l), ++ kl(static_cast(k) * static_cast(l)), ++ limit(limit), ++ B(B), ++ progress_interval(progress_interval), ++ progress_cb(progress_cb), ++ progress_user_data(progress_user_data), ++ next_progress(progress_interval), ++ use_getblock_opt(use_getblock_opt), ++ stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) { ++ form id = form::identity(D); ++ buckets.resize(static_cast(l) * (1ULL << k), id); ++ ++ if (use_getblock_opt) { ++ getblock_ok = init_getblock_opt_state(); 
++ } ++ } ++ ++ void OnIteration(int type, void* data, uint64_t iteration) override { ++ iteration++; ++ if (iteration > wanted_iter) { ++ return; ++ } ++ ++ if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) { ++ progress_cb(next_progress, progress_user_data); ++ next_progress += progress_interval; ++ } ++ ++ if (iteration % kl == 0) { ++ uint64_t pos = iteration / kl; ++ if (pos < limit) { ++ form checkpoint; ++ auto started_at = std::chrono::steady_clock::time_point{}; ++ if (stats_enabled) { ++ started_at = std::chrono::steady_clock::now(); ++ } ++ SetForm(type, data, &checkpoint); ++ process_checkpoint(pos, checkpoint, /*record_stats=*/true); ++ if (stats_enabled) { ++ checkpoint_event_total_ns += static_cast( ++ std::chrono::duration_cast( ++ std::chrono::steady_clock::now() - started_at) ++ .count()); ++ } ++ } ++ } ++ ++ if (iteration == wanted_iter) { ++ SetForm(type, data, &result); ++ has_result = true; ++ } ++ } ++ ++ void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) { ++ const bool do_stats = stats_enabled && record_stats; ++ auto started_at = std::chrono::steady_clock::time_point{}; ++ if (do_stats) { ++ started_at = std::chrono::steady_clock::now(); ++ } ++ ++ uint64_t local_updates = 0; ++ for (uint32_t j = 0; j < l; j++) { ++ uint64_t p = i * static_cast(l) + static_cast(j); ++ uint64_t needed = static_cast(k) * (p + 1); ++ if (wanted_iter < needed) { ++ break; ++ } ++ uint64_t b = use_getblock_opt ? 
get_block_opt(p) : get_block(p, k, wanted_iter, B); ++ if (do_stats) { ++ local_updates++; ++ } ++ nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L); ++ } ++ ++ if (do_stats) { ++ checkpoint_calls++; ++ bucket_updates += local_updates; ++ checkpoint_total_ns += static_cast( ++ std::chrono::duration_cast( ++ std::chrono::steady_clock::now() - started_at) ++ .count()); ++ } ++ } ++ ++ bool init_ok() const { return getblock_ok; } ++ ++ bool ok() const { return has_result; } ++ ++ const form& y() const { return result; } ++ ++ form finalize_proof() { ++ auto started_at = std::chrono::steady_clock::time_point{}; ++ if (stats_enabled) { ++ started_at = std::chrono::steady_clock::now(); ++ } ++ ++ PulmarkReducer reducer; ++ form id = form::identity(D); ++ ++ uint64_t k1 = k / 2; ++ uint64_t k0 = k - k1; ++ form x = id; ++ ++ for (int64_t j = static_cast(l) - 1; j >= 0; j--) { ++ x = FastPowFormNucomp(x, D, integer(static_cast(1) << k), L, reducer); ++ ++ for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { ++ form z = id; ++ for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { ++ nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); ++ } ++ z = FastPowFormNucomp( ++ z, ++ D, ++ integer(static_cast(b1 * (1ULL << k0))), ++ L, ++ reducer); ++ nucomp_form(x, x, z, D, L); ++ } ++ ++ for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { ++ form z = id; ++ for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { ++ nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); ++ } ++ z = FastPowFormNucomp(z, D, integer(b0), L, reducer); ++ nucomp_form(x, x, z, D, L); ++ } ++ } ++ ++ reducer.reduce(x); ++ ++ if (stats_enabled) { ++ finalize_total_ns += static_cast( ++ std::chrono::duration_cast( ++ std::chrono::steady_clock::now() - started_at) ++ .count()); ++ } ++ return x; ++ } ++ ++ bool stats_ok() const { return stats_enabled; } ++ ++ LastStreamingStats stats() const { ++ LastStreamingStats out; ++ out.checkpoint_total_ns = checkpoint_total_ns; ++ 
out.checkpoint_event_total_ns = checkpoint_event_total_ns; ++ out.finalize_total_ns = finalize_total_ns; ++ out.checkpoint_calls = checkpoint_calls; ++ out.bucket_updates = bucket_updates; ++ out.set = stats_enabled; ++ return out; ++ } ++ ++ private: ++ form& bucket(uint32_t j, uint64_t b) { ++ size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); ++ return buckets[idx]; ++ } ++ ++ const form& bucket(uint32_t j, uint64_t b) const { ++ size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); ++ return buckets[idx]; ++ } ++ ++ uint64_t wanted_iter; ++ uint32_t k; ++ uint32_t l; ++ uint64_t kl; ++ uint64_t limit; ++ integer B; ++ uint64_t progress_interval; ++ ChiavdfProgressCallback progress_cb; ++ void* progress_user_data; ++ uint64_t next_progress; ++ ++ std::vector buckets; ++ form result; ++ bool has_result = false; ++ ++ bool use_getblock_opt; ++ bool getblock_ok = true; ++ uint64_t getblock_next_p = 0; ++ integer getblock_inv_2k; ++ integer getblock_r; ++ integer getblock_tmp; ++ ++ bool stats_enabled; ++ uint64_t checkpoint_total_ns = 0; ++ uint64_t checkpoint_event_total_ns = 0; ++ uint64_t finalize_total_ns = 0; ++ uint64_t checkpoint_calls = 0; ++ uint64_t bucket_updates = 0; ++ ++ bool init_getblock_opt_state() { ++ if (k == 0) { ++ return false; ++ } ++ uint64_t k_u64 = static_cast(k); ++ if (wanted_iter < k_u64) { ++ return true; ++ } ++ ++ integer two_k_mod = FastPow(2, k_u64, B); ++ if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) { ++ return false; ++ } ++ ++ getblock_r = FastPow(2, wanted_iter - k_u64, B); ++ getblock_next_p = 0; ++ return true; ++ } ++ ++ uint64_t get_block_opt(uint64_t p) { ++ if (!getblock_ok || wanted_iter < static_cast(k)) { ++ return get_block(p, k, wanted_iter, B); ++ } ++ ++ // Expected call pattern is sequential `p`. If we ever get out of sync, ++ // advance state forward or fall back to the slow mapping. 
++ if (p < getblock_next_p) { ++ return get_block(p, k, wanted_iter, B); ++ } ++ while (getblock_next_p < p) { ++ mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); ++ mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); ++ getblock_next_p++; ++ } ++ ++ mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k); ++ mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl); ++ uint64_t b = mpz_get_ui(getblock_tmp.impl); ++ ++ mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); ++ mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); ++ getblock_next_p++; ++ ++ return b; ++ } ++}; ++ ++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data, ++ bool use_getblock_opt) { ++ std::call_once(init_once, init_chiavdf_fast); ++ ++ last_streaming_stats = LastStreamingStats{}; ++ ++ if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 || ++ y_ref_s == nullptr || y_ref_s_size == 0) { ++ return empty_result(); ++ } ++ if (num_iterations == 0) { ++ return empty_result(); ++ } ++ ++ std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); ++ integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); ++ integer L = root(-D, 4); ++ ++ form x = DeserializeForm(D, x_s, x_s_size); ++ form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size); ++ ++ uint32_t k; ++ uint32_t l; ++ bool tuned = false; ++ const uint64_t budget = ++ bucket_memory_budget_bytes.load(std::memory_order_relaxed); ++ if (num_iterations >= (1 << 16)) { ++ tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k); ++ } ++ if (!tuned) { ++ if (num_iterations >= (1 << 16)) 
{ ++ ApproximateParameters(num_iterations, l, k); ++ } else { ++ k = 10; ++ l = 1; ++ } ++ } ++ if (k == 0) { ++ k = 1; ++ } ++ if (l == 0) { ++ l = 1; ++ } ++ ++ last_streaming_parameters.k = k; ++ last_streaming_parameters.l = l; ++ last_streaming_parameters.tuned = tuned; ++ last_streaming_parameters.set = true; ++ ++ uint64_t kl = static_cast(k) * static_cast(l); ++ uint64_t limit = num_iterations / kl; ++ if (num_iterations % kl) { ++ limit++; ++ } ++ ++ integer B = GetB(D, x, y_ref); ++ ++ std::atomic stopped(false); ++ StreamingOneWesolowskiCallback weso( ++ D, ++ num_iterations, ++ k, ++ l, ++ limit, ++ B, ++ use_getblock_opt, ++ progress_interval, ++ progress_cb, ++ progress_user_data); ++ ++ if (!weso.init_ok()) { ++ return empty_result(); ++ } ++ ++ weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false); ++ ++ FastStorage* fast_storage = nullptr; ++ repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); ++ ++ if (!weso.ok()) { ++ return empty_result(); ++ } ++ if (!(weso.y() == y_ref)) { ++ return empty_result(); ++ } ++ ++ form proof_form = weso.finalize_proof(); ++ ++ if (weso.stats_ok()) { ++ last_streaming_stats = weso.stats(); ++ } ++ ++ int d_bits = D.num_bits(); ++ std::vector y_serialized = SerializeForm(y_ref, d_bits); ++ std::vector proof_serialized = SerializeForm(proof_form, d_bits); ++ ++ if (y_serialized.empty() || proof_serialized.empty()) { ++ return empty_result(); ++ } ++ ++ const size_t total = y_serialized.size() + proof_serialized.size(); ++ uint8_t* out = new uint8_t[total]; ++ std::copy(y_serialized.begin(), y_serialized.end(), out); ++ std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size()); ++ return ChiavdfByteArray{out, total}; ++} ++} // namespace ++ ++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations) { ++ 
return chiavdf_prove_one_weso_fast_with_progress( ++ challenge_hash, ++ challenge_size, ++ x_s, ++ x_s_size, ++ discriminant_size_bits, ++ num_iterations, ++ /*progress_interval=*/0, ++ /*progress_cb=*/nullptr, ++ /*progress_user_data=*/nullptr); ++} ++ ++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data) { ++ try { ++ std::call_once(init_once, init_chiavdf_fast); ++ ++ if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) { ++ return empty_result(); ++ } ++ if (num_iterations == 0) { ++ return empty_result(); ++ } ++ ++ std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); ++ integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); ++ integer L = root(-D, 4); ++ ++ form x = DeserializeForm(D, x_s, x_s_size); ++ ++ std::atomic stopped(false); ++ ProgressOneWesolowskiCallback weso( ++ D, ++ x, ++ num_iterations, ++ progress_interval, ++ progress_cb, ++ progress_user_data); ++ ++ // Run the fast repeated-squaring engine to `num_iterations`. ++ // The callback stores all intermediates needed for the proof. ++ FastStorage* fast_storage = nullptr; ++ repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); ++ ++ // Now generate the compact proof from the stored intermediates. 
++ Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped); ++ if (proof.y.empty() || proof.proof.empty()) { ++ return empty_result(); ++ } ++ ++ const size_t total = proof.y.size() + proof.proof.size(); ++ uint8_t* out = new uint8_t[total]; ++ std::copy(proof.y.begin(), proof.y.end(), out); ++ std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size()); ++ return ChiavdfByteArray{out, total}; ++ } catch (...) { ++ return empty_result(); ++ } ++} ++ ++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations) { ++ return chiavdf_prove_one_weso_fast_streaming_with_progress( ++ challenge_hash, ++ challenge_size, ++ x_s, ++ x_s_size, ++ y_ref_s, ++ y_ref_s_size, ++ discriminant_size_bits, ++ num_iterations, ++ /*progress_interval=*/0, ++ /*progress_cb=*/nullptr, ++ /*progress_user_data=*/nullptr); ++} ++ ++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data) { ++ try { ++ return chiavdf_prove_one_weso_fast_streaming_impl( ++ challenge_hash, ++ challenge_size, ++ x_s, ++ x_s_size, ++ y_ref_s, ++ y_ref_s_size, ++ discriminant_size_bits, ++ num_iterations, ++ progress_interval, ++ progress_cb, ++ progress_user_data, ++ /*use_getblock_opt=*/false); ++ } catch (...) 
{ ++ return empty_result(); ++ } ++} ++ ++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations) { ++ return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( ++ challenge_hash, ++ challenge_size, ++ x_s, ++ x_s_size, ++ y_ref_s, ++ y_ref_s_size, ++ discriminant_size_bits, ++ num_iterations, ++ /*progress_interval=*/0, ++ /*progress_cb=*/nullptr, ++ /*progress_user_data=*/nullptr); ++} ++ ++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data) { ++ try { ++ return chiavdf_prove_one_weso_fast_streaming_impl( ++ challenge_hash, ++ challenge_size, ++ x_s, ++ x_s_size, ++ y_ref_s, ++ y_ref_s_size, ++ discriminant_size_bits, ++ num_iterations, ++ progress_interval, ++ progress_cb, ++ progress_user_data, ++ /*use_getblock_opt=*/true); ++ } catch (...) 
{ ++ return empty_result(); ++ } ++} ++ ++extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) { ++ bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed); ++} ++ ++extern "C" void chiavdf_set_enable_streaming_stats(bool enable) { ++ streaming_stats_enabled.store(enable, std::memory_order_relaxed); ++ last_streaming_stats = LastStreamingStats{}; ++} ++ ++extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) { ++ if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) { ++ return false; ++ } ++ if (!last_streaming_parameters.set) { ++ return false; ++ } ++ *out_k = last_streaming_parameters.k; ++ *out_l = last_streaming_parameters.l; ++ *out_tuned = last_streaming_parameters.tuned; ++ return true; ++} ++ ++extern "C" bool chiavdf_get_last_streaming_stats( ++ uint64_t* out_checkpoint_total_ns, ++ uint64_t* out_checkpoint_event_total_ns, ++ uint64_t* out_finalize_total_ns, ++ uint64_t* out_checkpoint_calls, ++ uint64_t* out_bucket_updates) { ++ if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr || ++ out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr || ++ out_bucket_updates == nullptr) { ++ return false; ++ } ++ if (!last_streaming_stats.set) { ++ return false; ++ } ++ *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns; ++ *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns; ++ *out_finalize_total_ns = last_streaming_stats.finalize_total_ns; ++ *out_checkpoint_calls = last_streaming_stats.checkpoint_calls; ++ *out_bucket_updates = last_streaming_stats.bucket_updates; ++ return true; ++} ++ ++extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; } +diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h +new file mode 100644 +index 0000000..bf33f32 +--- /dev/null ++++ b/src/c_bindings/fast_wrapper.h +@@ -0,0 +1,145 @@ ++#pragma once ++ 
++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct { ++ uint8_t* data; ++ size_t length; ++} ChiavdfByteArray; ++ ++typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data); ++ ++// Configure the per-process memory budget used by the parameter tuner when ++// selecting `(k,l)` for streaming/bucket-based proving. ++// ++// The budget is per worker process (not global across multiple processes). ++// ++// If `bytes` is 0, the default chiavdf heuristic is used. ++void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes); ++ ++// Debug helper: returns the `(k,l)` parameters selected for the most recent ++// streaming proof computed on the current thread. ++// ++// Returns true if parameters are available. ++bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned); ++ ++// Enable lightweight timing counters for the streaming prover. ++// ++// When enabled, the native library records basic timing counters for the most ++// recent streaming proof computed on the current thread. This is intended for ++// benchmarking and tuning; production runs should keep this disabled to avoid ++// extra overhead. ++void chiavdf_set_enable_streaming_stats(bool enable); ++ ++// Debug helper: returns timing counters for the most recent streaming proof on ++// the current thread. ++// ++// Returns true if stats are available (i.e. stats enabled and a streaming proof ++// was computed successfully). ++bool chiavdf_get_last_streaming_stats( ++ uint64_t* out_checkpoint_total_ns, ++ uint64_t* out_checkpoint_event_total_ns, ++ uint64_t* out_finalize_total_ns, ++ uint64_t* out_checkpoint_calls, ++ uint64_t* out_bucket_updates); ++ ++// Computes a compact (witness_type=0) Wesolowski proof using the fast engine. 
++// ++// On success, returns `y || proof` where: ++// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants) ++// - `proof` is the serialized witness form (same size as `y`) ++// ++// On failure, returns `{NULL, 0}`. ++ChiavdfByteArray chiavdf_prove_one_weso_fast( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations); ++ ++// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from ++// the proving thread every `progress_interval` iterations completed. ++// ++// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported. ++ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data); ++ ++// Computes a compact (witness_type=0) Wesolowski proof using the "streaming" ++// bucket-accumulation algorithm (Trick 1), which requires the expected output ++// `y_ref` up front (as used by bluebox compaction jobs). ++// ++// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`). ++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations); ++ ++// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes ++// `progress_cb` from the proving thread every `progress_interval` iterations. 
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data); ++ ++// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized ++// implementation of the `GetBlock()` mapping (avoids per-block modular ++// exponentiation without allocating a full `GetBlock` table). ++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations); ++ ++// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally ++// invokes `progress_cb` from the proving thread every `progress_interval` ++// iterations. 
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( ++ const uint8_t* challenge_hash, ++ size_t challenge_size, ++ const uint8_t* x_s, ++ size_t x_s_size, ++ const uint8_t* y_ref_s, ++ size_t y_ref_s_size, ++ size_t discriminant_size_bits, ++ uint64_t num_iterations, ++ uint64_t progress_interval, ++ ChiavdfProgressCallback progress_cb, ++ void* progress_user_data); ++ ++void chiavdf_free_byte_array(ChiavdfByteArray array); ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/src/threading.h b/src/threading.h +index 50d4b49..f6344ad 100644 +--- a/src/threading.h ++++ b/src/threading.h +@@ -564,8 +564,8 @@ struct alignas(64) thread_counter { + } + }; + +-thread_counter master_counter[100]; +-thread_counter slave_counter[100]; ++thread_counter master_counter[512]; ++thread_counter slave_counter[512]; + + struct thread_state { + int pairindex; +diff --git a/src/vdf.h b/src/vdf.h +index 9ab4aef..4544fe2 100644 +--- a/src/vdf.h ++++ b/src/vdf.h +@@ -78,6 +78,18 @@ std::mutex new_event_mutex, cout_lock; + bool debug_mode = false; + bool fast_algorithm = false; + bool two_weso = false; ++bool quiet_mode = false; ++ ++// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`. ++// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`. ++// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can ++// run concurrently in the same process; they must not share a pairindex. 
++inline int vdf_fast_pairindex() { ++ constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0])); ++ static std::atomic next_slot{0}; ++ thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots; ++ return slot; ++} + + //always works + void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, const integer& L, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) { +@@ -137,7 +149,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege + + // This works single threaded + square_state_type square_state; +- square_state.pairindex=0; ++ square_state.pairindex=vdf_fast_pairindex(); + + uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso); + +@@ -236,10 +248,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege + } + #endif + } +- { +- // this shouldn't be needed but avoids some false positive in TSAN +- std::lock_guard lk(cout_lock); +- std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush; ++ if (!quiet_mode) { ++ { ++ // this shouldn't be needed but avoids some false positive in TSAN ++ std::lock_guard lk(cout_lock); ++ std::cout << "VDF loop finished. 
Total iters: " << num_iterations << "\n" << std::flush; ++ } + } + + #ifdef VDF_TEST +@@ -275,11 +289,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba + proof_serialized = SerializeForm(proof_form, d_bits); + Proof proof(y_serialized, proof_serialized); + proof.witness_type = 0; +- { +- // this shouldn't be needed but avoids some false positive in TSAN +- std::lock_guard lk(cout_lock); +- std::cout << "Got simple weso proof: " << proof.hex() << "\n"; +- } + return proof; + } + +diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md +new file mode 100644 +index 0000000..61cd1fd +--- /dev/null ++++ b/docs/bluebox_compaction.md +@@ -0,0 +1,49 @@ ++# Bluebox Compaction Optimizations ++ ++This document describes the compaction-oriented proving path exposed by ++`src/c_bindings/fast_wrapper.h` and implemented in ++`src/c_bindings/fast_wrapper.cpp`. ++ ++## Scope ++ ++These APIs are intended for workloads where the expected VDF output (`y_ref`) is ++already known up front (for example, bluebox compaction jobs). They are additive ++and do not change the existing `c_wrapper` APIs. ++ ++## Optimization 1: Streaming one-wesolowski ++ ++Given `y_ref`, the prover computes: ++ ++- `B = GetB(D, x, y_ref)` before squaring starts ++ ++This enables a streaming algorithm that updates proof buckets at each ++checkpoint during repeated squaring, instead of materializing the full ++intermediate checkpoint array and scanning it after the loop. In practice this ++substantially reduces memory usage for compaction workloads. ++ ++## Optimization 2: Incremental GetBlock mapping ++ ++For streaming checkpoint updates, bucket index selection repeatedly calls ++`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and ++advances sequential `p` values incrementally, avoiding full modular ++exponentiation per call and avoiding a large lookup table. 
++ ++## Optimization 3: Memory-budgeted (k, l) tuning ++ ++The wrapper can tune `(k, l)` under a configured memory budget: ++ ++- `chiavdf_set_bucket_memory_budget_bytes(...)` ++ ++If no tuned candidate is found, the code falls back to the standard parameter ++heuristics. ++ ++## Operational Notes ++ ++- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to ++ avoid unsolicited stdout noise when embedded in multi-worker clients. ++- Thread-slot assignment for the fast VDF counters is per-thread via ++ `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations ++ run in one process. ++- The production default for `enable_threads` in `parameters.h` is unchanged from ++ upstream to preserve timelord expectations. ++ diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp index 61d24599..d660ee80 100644 --- a/src/c_bindings/fast_wrapper.cpp +++ b/src/c_bindings/fast_wrapper.cpp @@ -1,7 +1,9 @@ #include "fast_wrapper.h" #include +#include #include +#include #include #include #include @@ -63,6 +65,39 @@ void init_chiavdf_fast() { ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; } +uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) { + if (lhs > std::numeric_limits::max() - rhs) { + return std::numeric_limits::max(); + } + return lhs + rhs; +} + +void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) { + if (arrays == nullptr) { + return; + } + for (size_t idx = 0; idx < count; ++idx) { + delete[] arrays[idx].data; + arrays[idx].data = nullptr; + arrays[idx].length = 0; + } + delete[] arrays; +} + +struct BatchProgressContext { + uint64_t completed_before = 0; + ChiavdfProgressCallback progress_cb = nullptr; + void* progress_user_data = nullptr; +}; + +void batch_progress_trampoline(uint64_t iters_done, void* user_data) { + auto* ctx = static_cast(user_data); + if (ctx == nullptr || ctx->progress_cb == nullptr) { + return; + } + 
ctx->progress_cb(saturating_add_u64(ctx->completed_before, iters_done), ctx->progress_user_data); +} + uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) { // Be conservative: class group forms contain 3 GMP-backed integers that // quickly grow to the discriminant size (or beyond) during NUCOMP. @@ -96,6 +131,13 @@ bool tune_streaming_parameters( unsigned __int128 best_cost = std::numeric_limits::max(); bool found = false; +#ifndef NDEBUG + uint32_t best_k = 0; + uint32_t best_l = 0; + unsigned __int128 best_updates = 0; + unsigned __int128 best_checkpoints = 0; + unsigned __int128 best_fold = 0; +#endif // Empirical tuning notes (1024-bit discriminants, AVX2 build): // - Each bucket update (NUCOMP) and each fold unit is ~5µs. @@ -133,10 +175,42 @@ bool tune_streaming_parameters( best_cost = cost; out_k = k; out_l = l; +#ifndef NDEBUG + best_k = k; + best_l = l; + best_updates = updates; + best_checkpoints = checkpoints; + best_fold = fold; +#endif } } } +#ifndef NDEBUG + if (found) { + assert(best_k >= 4 && best_k <= 20); + assert(best_l >= 1 && best_l <= 64); + std::fprintf( + stderr, + "[chiavdf] tune_streaming_parameters: T=%llu, budget=%llu, selected=(k=%u,l=%u), " + "components{updates=%llu, checkpoints=%llu, fold=%llu}, weights{u=16,c=1,f=16}\n", + static_cast(num_iterations), + static_cast(memory_budget_bytes), + best_k, + best_l, + static_cast(best_updates), + static_cast(best_checkpoints), + static_cast(best_fold)); + if (best_k == 20 && num_iterations < (1ULL << 24)) { + std::fprintf( + stderr, + "[chiavdf] tune_streaming_parameters: high-k selection for moderate T " + "(k=20, T=%llu); verify measured update/fold timing assumptions.\n", + static_cast(num_iterations)); + } + } +#endif + return found; } @@ -331,6 +405,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { } } + public: bool init_ok() const { return getblock_ok; } bool ok() const { return has_result; } @@ -401,6 +476,7 @@ class 
StreamingOneWesolowskiCallback final : public WesolowskiCallback { return out; } + private: form& bucket(uint32_t j, uint64_t b) { size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); return buckets[idx]; @@ -836,4 +912,96 @@ extern "C" bool chiavdf_get_last_streaming_stats( return true; } +extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + const ChiavdfBatchJob* jobs, + size_t job_count, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data) { + if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) { + return nullptr; + } + if (discriminant_size_bits == 0 || jobs == nullptr || job_count == 0) { + return nullptr; + } + + ChiavdfByteArray* out_arrays = nullptr; + try { + out_arrays = new ChiavdfByteArray[job_count]; + for (size_t idx = 0; idx < job_count; ++idx) { + out_arrays[idx] = empty_result(); + } + + uint64_t completed_iters = 0; + for (size_t idx = 0; idx < job_count; ++idx) { + const ChiavdfBatchJob& job = jobs[idx]; + if (job.y_ref_s == nullptr || job.y_ref_s_size == 0 || job.num_iterations == 0) { + free_byte_array_batch_internal(out_arrays, job_count); + return nullptr; + } + + BatchProgressContext progress_ctx; + progress_ctx.completed_before = completed_iters; + progress_ctx.progress_cb = progress_cb; + progress_ctx.progress_user_data = progress_user_data; + const bool use_progress = progress_cb != nullptr && progress_interval != 0; + + out_arrays[idx] = chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + job.y_ref_s, + job.y_ref_s_size, + discriminant_size_bits, + job.num_iterations, + progress_interval, + use_progress ? batch_progress_trampoline : nullptr, + use_progress ? 
static_cast(&progress_ctx) : nullptr); + + if (out_arrays[idx].data == nullptr || out_arrays[idx].length == 0) { + free_byte_array_batch_internal(out_arrays, job_count); + return nullptr; + } + + completed_iters = saturating_add_u64(completed_iters, job.num_iterations); + } + + return out_arrays; + } catch (...) { + free_byte_array_batch_internal(out_arrays, job_count); + return nullptr; + } +} + +extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + const ChiavdfBatchJob* jobs, + size_t job_count) { + return chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress( + challenge_hash, + challenge_size, + x_s, + x_s_size, + discriminant_size_bits, + jobs, + job_count, + /*progress_interval=*/0, + /*progress_cb=*/nullptr, + /*progress_user_data=*/nullptr); +} + +extern "C" void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count) { + free_byte_array_batch_internal(arrays, count); +} + extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; } diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h index bf33f320..115c3abd 100644 --- a/src/c_bindings/fast_wrapper.h +++ b/src/c_bindings/fast_wrapper.h @@ -13,6 +13,12 @@ typedef struct { size_t length; } ChiavdfByteArray; +typedef struct { + const uint8_t* y_ref_s; + size_t y_ref_s_size; + uint64_t num_iterations; +} ChiavdfBatchJob; + typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data); // Configure the per-process memory budget used by the parameter tuner when @@ -138,6 +144,32 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progres ChiavdfProgressCallback progress_cb, void* progress_user_data); +// Batch variant: computes one proof per `jobs[i]` using a shared API surface. 
+// Returns an array of `job_count` results on success; caller owns/frees it. +ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + const ChiavdfBatchJob* jobs, + size_t job_count); + +// Same as batch API above, with optional aggregate progress callback. +ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress( + const uint8_t* challenge_hash, + size_t challenge_size, + const uint8_t* x_s, + size_t x_s_size, + size_t discriminant_size_bits, + const ChiavdfBatchJob* jobs, + size_t job_count, + uint64_t progress_interval, + ChiavdfProgressCallback progress_cb, + void* progress_user_data); + +void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count); + void chiavdf_free_byte_array(ChiavdfByteArray array); #ifdef __cplusplus From 95f8ff18d3adcfd291767fbc35273a202d6c645d Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 24 Feb 2026 00:51:52 -0800 Subject: [PATCH 07/13] Make fast-thread counter slots build-configurable. Default CHIA_VDF_FAST_COUNTER_SLOTS to 100 in threading.h so upstream builds keep lower BSS usage while allowing embedded deployments to override via compiler defines. 
Co-authored-by: Cursor --- src/threading.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/threading.h b/src/threading.h index 8354d824..dbb18592 100644 --- a/src/threading.h +++ b/src/threading.h @@ -566,8 +566,12 @@ struct alignas(64) thread_counter { } }; -thread_counter master_counter[512]; -thread_counter slave_counter[512]; +#ifndef CHIA_VDF_FAST_COUNTER_SLOTS +#define CHIA_VDF_FAST_COUNTER_SLOTS 100 +#endif + +thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS]; +thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS]; struct thread_state { int pairindex; From 746ba2e8edadadb0e81951c4bf93aaf8a5a3dbe0 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 24 Feb 2026 00:55:50 -0800 Subject: [PATCH 08/13] Fix fast pairindex allocator state across translation units. Use one program-wide atomic slot allocator for `vdf_fast_pairindex()` so concurrent VDF computations started from different translation units cannot collide on shared fast counter slots. Co-authored-by: Cursor --- src/vdf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/vdf.h b/src/vdf.h index 8ca75d8c..575cc78a 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -93,11 +93,16 @@ bool quiet_mode = false; // The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`. // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can // run concurrently in the same process; they must not share a pairindex. +#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) +// Keep slot allocation state as one program-wide entity for all TUs that include +// this header, so concurrent callers cannot recycle the same slot sequence. 
+inline std::atomic vdf_fast_next_slot{0}; +#endif + inline int vdf_fast_pairindex() { #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0])); - static std::atomic next_slot{0}; - thread_local int slot = int(next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots); + thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots); return slot; #else return 0; From 707b2f46e3499b2ba65547693401ceb7a4f069a3 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 24 Feb 2026 00:58:53 -0800 Subject: [PATCH 09/13] Guard streaming prover bucket shifts against invalid k. Reject k>=64 before any 64-bit left-shift and reuse validated bucket spans for allocation, indexing, and finalization loops so invalid parameter tuning cannot trigger undefined behavior. Co-authored-by: Cursor --- src/c_bindings/fast_wrapper.cpp | 53 ++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp index d660ee80..af3bf805 100644 --- a/src/c_bindings/fast_wrapper.cpp +++ b/src/c_bindings/fast_wrapper.cpp @@ -72,6 +72,14 @@ uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) { return lhs + rhs; } +bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) { + if (shift >= 64) { + return false; + } + out = 1ULL << shift; + return true; +} + void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) { if (arrays == nullptr) { return; @@ -289,7 +297,19 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { use_getblock_opt(use_getblock_opt), stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) { form id = form::identity(D); - buckets.resize(static_cast(l) * (1ULL << k), id); + uint64_t bucket_span_u64 = 0; + if (!try_pow2_u64_shift(k, bucket_span_u64)) { + getblock_ok = false; + return; + } + + 
bucket_span = static_cast(bucket_span_u64); + if (bucket_span != 0 && static_cast(l) > std::numeric_limits::max() / bucket_span) { + getblock_ok = false; + return; + } + + buckets.resize(static_cast(l) * bucket_span, id); if (use_getblock_opt) { getblock_ok = init_getblock_opt_state(); @@ -423,29 +443,35 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { uint64_t k1 = k / 2; uint64_t k0 = k - k1; + uint64_t span_k0 = 0; + uint64_t span_k1 = 0; + if (!try_pow2_u64_shift(static_cast(k0), span_k0) || + !try_pow2_u64_shift(static_cast(k1), span_k1)) { + return form::identity(D); + } form x = id; for (int64_t j = static_cast(l) - 1; j >= 0; j--) { - x = FastPowFormNucomp(x, D, integer(static_cast(1) << k), L, reducer); + x = FastPowFormNucomp(x, D, integer(static_cast(bucket_span)), L, reducer); - for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { + for (uint64_t b1 = 0; b1 < span_k1; b1++) { form z = id; - for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { - nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); + for (uint64_t b0 = 0; b0 < span_k0; b0++) { + nucomp_form(z, z, bucket(static_cast(j), b1 * span_k0 + b0), D, L); } z = FastPowFormNucomp( z, D, - integer(static_cast(b1 * (1ULL << k0))), + integer(static_cast(b1 * span_k0)), L, reducer); nucomp_form(x, x, z, D, L); } - for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { + for (uint64_t b0 = 0; b0 < span_k0; b0++) { form z = id; - for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { - nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); + for (uint64_t b1 = 0; b1 < span_k1; b1++) { + nucomp_form(z, z, bucket(static_cast(j), b1 * span_k0 + b0), D, L); } z = FastPowFormNucomp(z, D, integer(b0), L, reducer); nucomp_form(x, x, z, D, L); @@ -478,12 +504,12 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { private: form& bucket(uint32_t j, uint64_t b) { - size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); + size_t idx = 
static_cast(j) * bucket_span + static_cast(b); return buckets[idx]; } const form& bucket(uint32_t j, uint64_t b) const { - size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); + size_t idx = static_cast(j) * bucket_span + static_cast(b); return buckets[idx]; } @@ -497,6 +523,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { ChiavdfProgressCallback progress_cb; void* progress_user_data; uint64_t next_progress; + size_t bucket_span = 0; std::vector buckets; form result; @@ -620,6 +647,10 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl( if (l == 0) { l = 1; } + uint64_t ignored_bucket_span = 0; + if (!try_pow2_u64_shift(k, ignored_bucket_span)) { + return empty_result(); + } last_streaming_parameters.k = k; last_streaming_parameters.l = l; From 0c11002ba3aabb9cdb2f65bf31d9c8cd5393fb73 Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 12 May 2026 19:17:42 -0700 Subject: [PATCH 10/13] Harden fast counter slot safety and macOS cmake setup. Add compile-time guards that reject zero fast-counter slot configurations before modulo indexing, and export Homebrew's cmake path in macOS workflows so cmake is available within the same step on Intel runners. 
Co-authored-by: Cursor --- .github/workflows/build-c-libraries.yml | 1 + .github/workflows/build.yml | 1 + src/threading.h | 2 ++ src/vdf.h | 1 + 4 files changed, 5 insertions(+) diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml index db833104..451fabee 100644 --- a/.github/workflows/build-c-libraries.yml +++ b/.github/workflows/build-c-libraries.yml @@ -90,6 +90,7 @@ jobs: CMAKE_BIN="$(brew --prefix cmake)/bin" if [ -d "$CMAKE_BIN" ]; then echo "$CMAKE_BIN" >> "$GITHUB_PATH" + export PATH="$CMAKE_BIN:$PATH" fi cmake --version diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4ad967ec..798241d7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -110,6 +110,7 @@ jobs: CMAKE_BIN="$(brew --prefix cmake)/bin" if [ -d "$CMAKE_BIN" ]; then echo "$CMAKE_BIN" >> "$GITHUB_PATH" + export PATH="$CMAKE_BIN:$PATH" fi cmake --version diff --git a/src/threading.h b/src/threading.h index dbb18592..3574a98f 100644 --- a/src/threading.h +++ b/src/threading.h @@ -570,6 +570,8 @@ struct alignas(64) thread_counter { #define CHIA_VDF_FAST_COUNTER_SLOTS 100 #endif +static_assert(CHIA_VDF_FAST_COUNTER_SLOTS > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0"); + thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS]; thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS]; diff --git a/src/vdf.h b/src/vdf.h index 575cc78a..eb1d0d39 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -102,6 +102,7 @@ inline std::atomic vdf_fast_next_slot{0}; inline int vdf_fast_pairindex() { #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0])); + static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0"); thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots); return slot; #else From 1e7342199beb8e0d199f2201e6040ef5b4f6ca94 Mon Sep 17 00:00:00 2001 
From: Gene Hoffman Date: Tue, 12 May 2026 19:37:23 -0700 Subject: [PATCH 11/13] Remove stale patch artifact and refine tuner update cost. Drop the root-level development patch file that diverged from the live implementation, and adjust the streaming tuner cost model so bucket-update work scales with checkpoint count and `l` instead of only `k`. Co-authored-by: Cursor --- pr1_upstream_ready.patch | 1158 ------------------------------- src/c_bindings/fast_wrapper.cpp | 5 +- 2 files changed, 3 insertions(+), 1160 deletions(-) delete mode 100644 pr1_upstream_ready.patch diff --git a/pr1_upstream_ready.patch b/pr1_upstream_ready.patch deleted file mode 100644 index b14a93bb..00000000 --- a/pr1_upstream_ready.patch +++ /dev/null @@ -1,1158 +0,0 @@ -diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client -index ed41963..ca55a95 100644 ---- a/src/Makefile.vdf-client -+++ b/src/Makefile.vdf-client -@@ -6,9 +6,24 @@ else - NOPIE = -no-pie - endif - --LDFLAGS += -flto $(NOPIE) -g -+# Optional: override `LTO=` to disable link-time optimization. -+LTO ?= -flto -+ -+# Optional: set `PIC=1` to build position-independent objects (recommended when -+# linking chiavdf code into other PIE/shared-library binaries). 
-+PIC ?= 0 -+ifeq ($(PIC),1) -+PICFLAGS = -fPIC -+PIEFLAGS = -+else -+PICFLAGS = -+PIEFLAGS = $(NOPIE) -+endif -+ -+LDFLAGS += $(LTO) $(PIEFLAGS) -g - LDLIBS += -lgmpxx -lgmp -pthread --CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden -+CXXFLAGS += $(LTO) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden -+ASFLAGS += $(PICFLAGS) - ifeq ($(UNAME),Darwin) - CXXFLAGS += -D CHIAOSX=1 - endif -@@ -31,7 +46,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench - all: $(BINS) - - clean: -- rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client -+ rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a - - $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o - $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) -@@ -39,7 +54,10 @@ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_as - $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS) - - lzcnt.o: refcode/lzcnt.c -- $(CC) -c refcode/lzcnt.c -+ $(CC) -c refcode/lzcnt.c $(OPT_CFLAGS) $(PICFLAGS) -+ -+%.o: %.s -+ $(CC) -c $< -o $@ $(ASFLAGS) - - asm_compiled.s: compile_asm - ./compile_asm -@@ -53,6 +71,22 @@ avx512_asm_compiled.s: compile_asm - compile_asm: compile_asm.o - $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) - -+# --------------------------------------------------------------------------- -+# Static library: fast one-wesolowski proof (BBR integration) -+# --------------------------------------------------------------------------- -+ -+FASTLIB = libchiavdf_fastc.a -+FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o -+ -+.PHONY: fastlib -+ -+fastlib: $(FASTLIB) -+ -+$(FASTLIB): $(FASTLIB_OBJS) -+ $(AR) rcs $@ $^ -+ -+c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS) -+ - HW_OBJS = $(addprefix 
hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base.o lzcnt.o - EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o - HW_LIB = hw/libft4222/build-x86_64/libft4222.so -diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp -new file mode 100644 -index 0000000..198d0a8 ---- /dev/null -+++ b/src/c_bindings/fast_wrapper.cpp -@@ -0,0 +1,795 @@ -+#include "fast_wrapper.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "../vdf.h" -+#include "../create_discriminant.h" -+ -+// Runtime configuration knobs required by `parameters.h`. -+// These are `extern` variables there, but each binary defines them explicitly. -+bool use_divide_table = false; -+int gcd_base_bits = 50; -+int gcd_128_max_iter = 3; -+std::string asmprefix = "cel_"; -+bool enable_all_instructions = false; -+ -+namespace { -+std::once_flag init_once; -+std::atomic bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL); -+std::atomic streaming_stats_enabled(false); -+ -+struct LastStreamingParameters { -+ uint32_t k = 0; -+ uint32_t l = 0; -+ bool tuned = false; -+ bool set = false; -+}; -+ -+thread_local LastStreamingParameters last_streaming_parameters; -+ -+struct LastStreamingStats { -+ uint64_t checkpoint_total_ns = 0; -+ uint64_t checkpoint_event_total_ns = 0; -+ uint64_t finalize_total_ns = 0; -+ uint64_t checkpoint_calls = 0; -+ uint64_t bucket_updates = 0; -+ bool set = false; -+}; -+ -+thread_local LastStreamingStats last_streaming_stats; -+ -+void init_chiavdf_fast() { -+ init_gmp(); -+ set_rounding_mode(); -+ -+ // Match the vdf_client runtime selection for AVX2. -+ if (hasAVX2()) { -+ gcd_base_bits = 63; -+ gcd_128_max_iter = 2; -+ } else { -+ gcd_base_bits = 50; -+ gcd_128_max_iter = 3; -+ } -+ -+ // Ensure we run the one-wesolowski path by default. 
-+ fast_algorithm = false; -+ two_weso = false; -+ quiet_mode = true; -+} -+ -+ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; } -+ -+uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) { -+ // Be conservative: class group forms contain 3 GMP-backed integers that -+ // quickly grow to the discriminant size (or beyond) during NUCOMP. -+ // -+ // This estimate is intentionally larger than the raw serialized size to -+ // avoid picking parameters that risk paging/OOM. -+ uint64_t discr_bytes = (static_cast(discriminant_size_bits) + 7) / 8; -+ uint64_t estimate = discr_bytes * 16; -+ if (estimate < 2048) { -+ estimate = 2048; -+ } -+ return estimate; -+} -+ -+bool tune_streaming_parameters( -+ uint64_t num_iterations, -+ size_t discriminant_size_bits, -+ uint64_t memory_budget_bytes, -+ uint32_t& out_l, -+ uint32_t& out_k) { -+ if (memory_budget_bytes == 0) { -+ return false; -+ } -+ -+ // Keep headroom for GMP scratch allocations and general process overhead. -+ uint64_t budget = (memory_budget_bytes * 80) / 100; -+ uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits); -+ if (budget < bytes_per_form) { -+ return false; -+ } -+ -+ unsigned __int128 best_cost = std::numeric_limits::max(); -+ bool found = false; -+ -+ // Empirical tuning notes (1024-bit discriminants, AVX2 build): -+ // - Each bucket update (NUCOMP) and each fold unit is ~5µs. -+ // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs. -+ // -+ // So checkpoint counts should be weighted much lower than updates/fold. -+ constexpr unsigned __int128 update_weight = 16; -+ constexpr unsigned __int128 fold_weight = 16; -+ constexpr unsigned __int128 checkpoint_weight = 1; -+ -+ // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work -+ // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k). 
-+ for (uint32_t k = 4; k <= 20; k++) { -+ unsigned __int128 buckets_per_row = static_cast(1) << k; -+ -+ for (uint32_t l = 1; l <= 64; l++) { -+ unsigned __int128 form_count = buckets_per_row * static_cast(l); -+ unsigned __int128 mem_required = -+ form_count * static_cast(bytes_per_form); -+ if (mem_required > static_cast(budget)) { -+ continue; -+ } -+ -+ unsigned __int128 updates = static_cast( -+ (num_iterations + static_cast(k) - 1) / static_cast(k)); -+ uint64_t kl = static_cast(k) * static_cast(l); -+ unsigned __int128 checkpoints = static_cast( -+ (num_iterations + kl - 1) / kl); -+ unsigned __int128 fold = static_cast(l) << (k + 1); -+ unsigned __int128 cost = -+ updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight; -+ -+ if (!found || cost < best_cost) { -+ found = true; -+ best_cost = cost; -+ out_k = k; -+ out_l = l; -+ } -+ } -+ } -+ -+ return found; -+} -+ -+uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) { -+ integer res = FastPow(2, T - k * (i + 1), B); -+ mpz_mul_2exp(res.impl, res.impl, k); -+ res = res / B; -+ auto res_vector = res.to_vector(); -+ return res_vector.empty() ? 
0 : res_vector[0]; -+} -+ -+class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback { -+ public: -+ ProgressOneWesolowskiCallback( -+ integer& D, -+ form& f, -+ uint64_t wanted_iter, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data) -+ : OneWesolowskiCallback(D, f, wanted_iter), -+ progress_interval(progress_interval), -+ progress_cb(progress_cb), -+ progress_user_data(progress_user_data), -+ next_progress(progress_interval) {} -+ -+ void OnIteration(int type, void* data, uint64_t iteration) override { -+ OneWesolowskiCallback::OnIteration(type, data, iteration); -+ -+ if (progress_cb == nullptr || progress_interval == 0) { -+ return; -+ } -+ -+ uint64_t done = iteration + 1; -+ if (done > wanted_iter) { -+ return; -+ } -+ -+ if (done >= next_progress) { -+ progress_cb(next_progress, progress_user_data); -+ next_progress += progress_interval; -+ } -+ } -+ -+ private: -+ uint64_t progress_interval; -+ ChiavdfProgressCallback progress_cb; -+ void* progress_user_data; -+ uint64_t next_progress; -+}; -+ -+class StreamingOneWesolowskiCallback final : public WesolowskiCallback { -+ public: -+ StreamingOneWesolowskiCallback( -+ integer& D, -+ uint64_t wanted_iter, -+ uint32_t k, -+ uint32_t l, -+ uint64_t limit, -+ integer& B, -+ bool use_getblock_opt, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data) -+ : WesolowskiCallback(D), -+ wanted_iter(wanted_iter), -+ k(k), -+ l(l), -+ kl(static_cast(k) * static_cast(l)), -+ limit(limit), -+ B(B), -+ progress_interval(progress_interval), -+ progress_cb(progress_cb), -+ progress_user_data(progress_user_data), -+ next_progress(progress_interval), -+ use_getblock_opt(use_getblock_opt), -+ stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) { -+ form id = form::identity(D); -+ buckets.resize(static_cast(l) * (1ULL << k), id); -+ -+ if (use_getblock_opt) { -+ getblock_ok = init_getblock_opt_state(); 
-+ } -+ } -+ -+ void OnIteration(int type, void* data, uint64_t iteration) override { -+ iteration++; -+ if (iteration > wanted_iter) { -+ return; -+ } -+ -+ if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) { -+ progress_cb(next_progress, progress_user_data); -+ next_progress += progress_interval; -+ } -+ -+ if (iteration % kl == 0) { -+ uint64_t pos = iteration / kl; -+ if (pos < limit) { -+ form checkpoint; -+ auto started_at = std::chrono::steady_clock::time_point{}; -+ if (stats_enabled) { -+ started_at = std::chrono::steady_clock::now(); -+ } -+ SetForm(type, data, &checkpoint); -+ process_checkpoint(pos, checkpoint, /*record_stats=*/true); -+ if (stats_enabled) { -+ checkpoint_event_total_ns += static_cast( -+ std::chrono::duration_cast( -+ std::chrono::steady_clock::now() - started_at) -+ .count()); -+ } -+ } -+ } -+ -+ if (iteration == wanted_iter) { -+ SetForm(type, data, &result); -+ has_result = true; -+ } -+ } -+ -+ void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) { -+ const bool do_stats = stats_enabled && record_stats; -+ auto started_at = std::chrono::steady_clock::time_point{}; -+ if (do_stats) { -+ started_at = std::chrono::steady_clock::now(); -+ } -+ -+ uint64_t local_updates = 0; -+ for (uint32_t j = 0; j < l; j++) { -+ uint64_t p = i * static_cast(l) + static_cast(j); -+ uint64_t needed = static_cast(k) * (p + 1); -+ if (wanted_iter < needed) { -+ break; -+ } -+ uint64_t b = use_getblock_opt ? 
get_block_opt(p) : get_block(p, k, wanted_iter, B); -+ if (do_stats) { -+ local_updates++; -+ } -+ nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L); -+ } -+ -+ if (do_stats) { -+ checkpoint_calls++; -+ bucket_updates += local_updates; -+ checkpoint_total_ns += static_cast( -+ std::chrono::duration_cast( -+ std::chrono::steady_clock::now() - started_at) -+ .count()); -+ } -+ } -+ -+ bool init_ok() const { return getblock_ok; } -+ -+ bool ok() const { return has_result; } -+ -+ const form& y() const { return result; } -+ -+ form finalize_proof() { -+ auto started_at = std::chrono::steady_clock::time_point{}; -+ if (stats_enabled) { -+ started_at = std::chrono::steady_clock::now(); -+ } -+ -+ PulmarkReducer reducer; -+ form id = form::identity(D); -+ -+ uint64_t k1 = k / 2; -+ uint64_t k0 = k - k1; -+ form x = id; -+ -+ for (int64_t j = static_cast(l) - 1; j >= 0; j--) { -+ x = FastPowFormNucomp(x, D, integer(static_cast(1) << k), L, reducer); -+ -+ for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { -+ form z = id; -+ for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { -+ nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); -+ } -+ z = FastPowFormNucomp( -+ z, -+ D, -+ integer(static_cast(b1 * (1ULL << k0))), -+ L, -+ reducer); -+ nucomp_form(x, x, z, D, L); -+ } -+ -+ for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { -+ form z = id; -+ for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { -+ nucomp_form(z, z, bucket(static_cast(j), b1 * (1ULL << k0) + b0), D, L); -+ } -+ z = FastPowFormNucomp(z, D, integer(b0), L, reducer); -+ nucomp_form(x, x, z, D, L); -+ } -+ } -+ -+ reducer.reduce(x); -+ -+ if (stats_enabled) { -+ finalize_total_ns += static_cast( -+ std::chrono::duration_cast( -+ std::chrono::steady_clock::now() - started_at) -+ .count()); -+ } -+ return x; -+ } -+ -+ bool stats_ok() const { return stats_enabled; } -+ -+ LastStreamingStats stats() const { -+ LastStreamingStats out; -+ out.checkpoint_total_ns = checkpoint_total_ns; -+ 
out.checkpoint_event_total_ns = checkpoint_event_total_ns; -+ out.finalize_total_ns = finalize_total_ns; -+ out.checkpoint_calls = checkpoint_calls; -+ out.bucket_updates = bucket_updates; -+ out.set = stats_enabled; -+ return out; -+ } -+ -+ private: -+ form& bucket(uint32_t j, uint64_t b) { -+ size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); -+ return buckets[idx]; -+ } -+ -+ const form& bucket(uint32_t j, uint64_t b) const { -+ size_t idx = static_cast(j) * (1ULL << k) + static_cast(b); -+ return buckets[idx]; -+ } -+ -+ uint64_t wanted_iter; -+ uint32_t k; -+ uint32_t l; -+ uint64_t kl; -+ uint64_t limit; -+ integer B; -+ uint64_t progress_interval; -+ ChiavdfProgressCallback progress_cb; -+ void* progress_user_data; -+ uint64_t next_progress; -+ -+ std::vector buckets; -+ form result; -+ bool has_result = false; -+ -+ bool use_getblock_opt; -+ bool getblock_ok = true; -+ uint64_t getblock_next_p = 0; -+ integer getblock_inv_2k; -+ integer getblock_r; -+ integer getblock_tmp; -+ -+ bool stats_enabled; -+ uint64_t checkpoint_total_ns = 0; -+ uint64_t checkpoint_event_total_ns = 0; -+ uint64_t finalize_total_ns = 0; -+ uint64_t checkpoint_calls = 0; -+ uint64_t bucket_updates = 0; -+ -+ bool init_getblock_opt_state() { -+ if (k == 0) { -+ return false; -+ } -+ uint64_t k_u64 = static_cast(k); -+ if (wanted_iter < k_u64) { -+ return true; -+ } -+ -+ integer two_k_mod = FastPow(2, k_u64, B); -+ if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) { -+ return false; -+ } -+ -+ getblock_r = FastPow(2, wanted_iter - k_u64, B); -+ getblock_next_p = 0; -+ return true; -+ } -+ -+ uint64_t get_block_opt(uint64_t p) { -+ if (!getblock_ok || wanted_iter < static_cast(k)) { -+ return get_block(p, k, wanted_iter, B); -+ } -+ -+ // Expected call pattern is sequential `p`. If we ever get out of sync, -+ // advance state forward or fall back to the slow mapping. 
-+ if (p < getblock_next_p) { -+ return get_block(p, k, wanted_iter, B); -+ } -+ while (getblock_next_p < p) { -+ mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); -+ mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); -+ getblock_next_p++; -+ } -+ -+ mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k); -+ mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl); -+ uint64_t b = mpz_get_ui(getblock_tmp.impl); -+ -+ mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); -+ mpz_mod(getblock_r.impl, getblock_r.impl, B.impl); -+ getblock_next_p++; -+ -+ return b; -+ } -+}; -+ -+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data, -+ bool use_getblock_opt) { -+ std::call_once(init_once, init_chiavdf_fast); -+ -+ last_streaming_stats = LastStreamingStats{}; -+ -+ if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 || -+ y_ref_s == nullptr || y_ref_s_size == 0) { -+ return empty_result(); -+ } -+ if (num_iterations == 0) { -+ return empty_result(); -+ } -+ -+ std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); -+ integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); -+ integer L = root(-D, 4); -+ -+ form x = DeserializeForm(D, x_s, x_s_size); -+ form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size); -+ -+ uint32_t k; -+ uint32_t l; -+ bool tuned = false; -+ const uint64_t budget = -+ bucket_memory_budget_bytes.load(std::memory_order_relaxed); -+ if (num_iterations >= (1 << 16)) { -+ tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k); -+ } -+ if (!tuned) { -+ if (num_iterations >= (1 << 16)) 
{ -+ ApproximateParameters(num_iterations, l, k); -+ } else { -+ k = 10; -+ l = 1; -+ } -+ } -+ if (k == 0) { -+ k = 1; -+ } -+ if (l == 0) { -+ l = 1; -+ } -+ -+ last_streaming_parameters.k = k; -+ last_streaming_parameters.l = l; -+ last_streaming_parameters.tuned = tuned; -+ last_streaming_parameters.set = true; -+ -+ uint64_t kl = static_cast(k) * static_cast(l); -+ uint64_t limit = num_iterations / kl; -+ if (num_iterations % kl) { -+ limit++; -+ } -+ -+ integer B = GetB(D, x, y_ref); -+ -+ std::atomic stopped(false); -+ StreamingOneWesolowskiCallback weso( -+ D, -+ num_iterations, -+ k, -+ l, -+ limit, -+ B, -+ use_getblock_opt, -+ progress_interval, -+ progress_cb, -+ progress_user_data); -+ -+ if (!weso.init_ok()) { -+ return empty_result(); -+ } -+ -+ weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false); -+ -+ FastStorage* fast_storage = nullptr; -+ repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); -+ -+ if (!weso.ok()) { -+ return empty_result(); -+ } -+ if (!(weso.y() == y_ref)) { -+ return empty_result(); -+ } -+ -+ form proof_form = weso.finalize_proof(); -+ -+ if (weso.stats_ok()) { -+ last_streaming_stats = weso.stats(); -+ } -+ -+ int d_bits = D.num_bits(); -+ std::vector y_serialized = SerializeForm(y_ref, d_bits); -+ std::vector proof_serialized = SerializeForm(proof_form, d_bits); -+ -+ if (y_serialized.empty() || proof_serialized.empty()) { -+ return empty_result(); -+ } -+ -+ const size_t total = y_serialized.size() + proof_serialized.size(); -+ uint8_t* out = new uint8_t[total]; -+ std::copy(y_serialized.begin(), y_serialized.end(), out); -+ std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size()); -+ return ChiavdfByteArray{out, total}; -+} -+} // namespace -+ -+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations) { -+ 
return chiavdf_prove_one_weso_fast_with_progress( -+ challenge_hash, -+ challenge_size, -+ x_s, -+ x_s_size, -+ discriminant_size_bits, -+ num_iterations, -+ /*progress_interval=*/0, -+ /*progress_cb=*/nullptr, -+ /*progress_user_data=*/nullptr); -+} -+ -+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data) { -+ try { -+ std::call_once(init_once, init_chiavdf_fast); -+ -+ if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) { -+ return empty_result(); -+ } -+ if (num_iterations == 0) { -+ return empty_result(); -+ } -+ -+ std::vector challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size); -+ integer D = CreateDiscriminant(challenge_hash_bytes, static_cast(discriminant_size_bits)); -+ integer L = root(-D, 4); -+ -+ form x = DeserializeForm(D, x_s, x_s_size); -+ -+ std::atomic stopped(false); -+ ProgressOneWesolowskiCallback weso( -+ D, -+ x, -+ num_iterations, -+ progress_interval, -+ progress_cb, -+ progress_user_data); -+ -+ // Run the fast repeated-squaring engine to `num_iterations`. -+ // The callback stores all intermediates needed for the proof. -+ FastStorage* fast_storage = nullptr; -+ repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped); -+ -+ // Now generate the compact proof from the stored intermediates. 
-+ Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped); -+ if (proof.y.empty() || proof.proof.empty()) { -+ return empty_result(); -+ } -+ -+ const size_t total = proof.y.size() + proof.proof.size(); -+ uint8_t* out = new uint8_t[total]; -+ std::copy(proof.y.begin(), proof.y.end(), out); -+ std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size()); -+ return ChiavdfByteArray{out, total}; -+ } catch (...) { -+ return empty_result(); -+ } -+} -+ -+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations) { -+ return chiavdf_prove_one_weso_fast_streaming_with_progress( -+ challenge_hash, -+ challenge_size, -+ x_s, -+ x_s_size, -+ y_ref_s, -+ y_ref_s_size, -+ discriminant_size_bits, -+ num_iterations, -+ /*progress_interval=*/0, -+ /*progress_cb=*/nullptr, -+ /*progress_user_data=*/nullptr); -+} -+ -+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data) { -+ try { -+ return chiavdf_prove_one_weso_fast_streaming_impl( -+ challenge_hash, -+ challenge_size, -+ x_s, -+ x_s_size, -+ y_ref_s, -+ y_ref_s_size, -+ discriminant_size_bits, -+ num_iterations, -+ progress_interval, -+ progress_cb, -+ progress_user_data, -+ /*use_getblock_opt=*/false); -+ } catch (...) 
{ -+ return empty_result(); -+ } -+} -+ -+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations) { -+ return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( -+ challenge_hash, -+ challenge_size, -+ x_s, -+ x_s_size, -+ y_ref_s, -+ y_ref_s_size, -+ discriminant_size_bits, -+ num_iterations, -+ /*progress_interval=*/0, -+ /*progress_cb=*/nullptr, -+ /*progress_user_data=*/nullptr); -+} -+ -+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data) { -+ try { -+ return chiavdf_prove_one_weso_fast_streaming_impl( -+ challenge_hash, -+ challenge_size, -+ x_s, -+ x_s_size, -+ y_ref_s, -+ y_ref_s_size, -+ discriminant_size_bits, -+ num_iterations, -+ progress_interval, -+ progress_cb, -+ progress_user_data, -+ /*use_getblock_opt=*/true); -+ } catch (...) 
{ -+ return empty_result(); -+ } -+} -+ -+extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) { -+ bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed); -+} -+ -+extern "C" void chiavdf_set_enable_streaming_stats(bool enable) { -+ streaming_stats_enabled.store(enable, std::memory_order_relaxed); -+ last_streaming_stats = LastStreamingStats{}; -+} -+ -+extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) { -+ if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) { -+ return false; -+ } -+ if (!last_streaming_parameters.set) { -+ return false; -+ } -+ *out_k = last_streaming_parameters.k; -+ *out_l = last_streaming_parameters.l; -+ *out_tuned = last_streaming_parameters.tuned; -+ return true; -+} -+ -+extern "C" bool chiavdf_get_last_streaming_stats( -+ uint64_t* out_checkpoint_total_ns, -+ uint64_t* out_checkpoint_event_total_ns, -+ uint64_t* out_finalize_total_ns, -+ uint64_t* out_checkpoint_calls, -+ uint64_t* out_bucket_updates) { -+ if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr || -+ out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr || -+ out_bucket_updates == nullptr) { -+ return false; -+ } -+ if (!last_streaming_stats.set) { -+ return false; -+ } -+ *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns; -+ *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns; -+ *out_finalize_total_ns = last_streaming_stats.finalize_total_ns; -+ *out_checkpoint_calls = last_streaming_stats.checkpoint_calls; -+ *out_bucket_updates = last_streaming_stats.bucket_updates; -+ return true; -+} -+ -+extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; } -diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h -new file mode 100644 -index 0000000..bf33f32 ---- /dev/null -+++ b/src/c_bindings/fast_wrapper.h -@@ -0,0 +1,145 @@ -+#pragma once -+ 
-+#include -+#include -+#include -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+typedef struct { -+ uint8_t* data; -+ size_t length; -+} ChiavdfByteArray; -+ -+typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data); -+ -+// Configure the per-process memory budget used by the parameter tuner when -+// selecting `(k,l)` for streaming/bucket-based proving. -+// -+// The budget is per worker process (not global across multiple processes). -+// -+// If `bytes` is 0, the default chiavdf heuristic is used. -+void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes); -+ -+// Debug helper: returns the `(k,l)` parameters selected for the most recent -+// streaming proof computed on the current thread. -+// -+// Returns true if parameters are available. -+bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned); -+ -+// Enable lightweight timing counters for the streaming prover. -+// -+// When enabled, the native library records basic timing counters for the most -+// recent streaming proof computed on the current thread. This is intended for -+// benchmarking and tuning; production runs should keep this disabled to avoid -+// extra overhead. -+void chiavdf_set_enable_streaming_stats(bool enable); -+ -+// Debug helper: returns timing counters for the most recent streaming proof on -+// the current thread. -+// -+// Returns true if stats are available (i.e. stats enabled and a streaming proof -+// was computed successfully). -+bool chiavdf_get_last_streaming_stats( -+ uint64_t* out_checkpoint_total_ns, -+ uint64_t* out_checkpoint_event_total_ns, -+ uint64_t* out_finalize_total_ns, -+ uint64_t* out_checkpoint_calls, -+ uint64_t* out_bucket_updates); -+ -+// Computes a compact (witness_type=0) Wesolowski proof using the fast engine. 
-+// -+// On success, returns `y || proof` where: -+// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants) -+// - `proof` is the serialized witness form (same size as `y`) -+// -+// On failure, returns `{NULL, 0}`. -+ChiavdfByteArray chiavdf_prove_one_weso_fast( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations); -+ -+// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from -+// the proving thread every `progress_interval` iterations completed. -+// -+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported. -+ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data); -+ -+// Computes a compact (witness_type=0) Wesolowski proof using the "streaming" -+// bucket-accumulation algorithm (Trick 1), which requires the expected output -+// `y_ref` up front (as used by bluebox compaction jobs). -+// -+// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`). -+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations); -+ -+// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes -+// `progress_cb` from the proving thread every `progress_interval` iterations. 
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data); -+ -+// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized -+// implementation of the `GetBlock()` mapping (avoids per-block modular -+// exponentiation without allocating a full `GetBlock` table). -+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations); -+ -+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally -+// invokes `progress_cb` from the proving thread every `progress_interval` -+// iterations. 
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( -+ const uint8_t* challenge_hash, -+ size_t challenge_size, -+ const uint8_t* x_s, -+ size_t x_s_size, -+ const uint8_t* y_ref_s, -+ size_t y_ref_s_size, -+ size_t discriminant_size_bits, -+ uint64_t num_iterations, -+ uint64_t progress_interval, -+ ChiavdfProgressCallback progress_cb, -+ void* progress_user_data); -+ -+void chiavdf_free_byte_array(ChiavdfByteArray array); -+ -+#ifdef __cplusplus -+} -+#endif -diff --git a/src/threading.h b/src/threading.h -index 50d4b49..f6344ad 100644 ---- a/src/threading.h -+++ b/src/threading.h -@@ -564,8 +564,8 @@ struct alignas(64) thread_counter { - } - }; - --thread_counter master_counter[100]; --thread_counter slave_counter[100]; -+thread_counter master_counter[512]; -+thread_counter slave_counter[512]; - - struct thread_state { - int pairindex; -diff --git a/src/vdf.h b/src/vdf.h -index 9ab4aef..4544fe2 100644 ---- a/src/vdf.h -+++ b/src/vdf.h -@@ -78,6 +78,18 @@ std::mutex new_event_mutex, cout_lock; - bool debug_mode = false; - bool fast_algorithm = false; - bool two_weso = false; -+bool quiet_mode = false; -+ -+// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`. -+// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`. -+// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can -+// run concurrently in the same process; they must not share a pairindex. 
-+inline int vdf_fast_pairindex() { -+ constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0])); -+ static std::atomic next_slot{0}; -+ thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots; -+ return slot; -+} - - //always works - void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, const integer& L, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) { -@@ -137,7 +149,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege - - // This works single threaded - square_state_type square_state; -- square_state.pairindex=0; -+ square_state.pairindex=vdf_fast_pairindex(); - - uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso); - -@@ -236,10 +248,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege - } - #endif - } -- { -- // this shouldn't be needed but avoids some false positive in TSAN -- std::lock_guard lk(cout_lock); -- std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush; -+ if (!quiet_mode) { -+ { -+ // this shouldn't be needed but avoids some false positive in TSAN -+ std::lock_guard lk(cout_lock); -+ std::cout << "VDF loop finished. 
Total iters: " << num_iterations << "\n" << std::flush; -+ } - } - - #ifdef VDF_TEST -@@ -275,11 +289,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba - proof_serialized = SerializeForm(proof_form, d_bits); - Proof proof(y_serialized, proof_serialized); - proof.witness_type = 0; -- { -- // this shouldn't be needed but avoids some false positive in TSAN -- std::lock_guard lk(cout_lock); -- std::cout << "Got simple weso proof: " << proof.hex() << "\n"; -- } - return proof; - } - -diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md -new file mode 100644 -index 0000000..61cd1fd ---- /dev/null -+++ b/docs/bluebox_compaction.md -@@ -0,0 +1,49 @@ -+# Bluebox Compaction Optimizations -+ -+This document describes the compaction-oriented proving path exposed by -+`src/c_bindings/fast_wrapper.h` and implemented in -+`src/c_bindings/fast_wrapper.cpp`. -+ -+## Scope -+ -+These APIs are intended for workloads where the expected VDF output (`y_ref`) is -+already known up front (for example, bluebox compaction jobs). They are additive -+and do not change the existing `c_wrapper` APIs. -+ -+## Optimization 1: Streaming one-wesolowski -+ -+Given `y_ref`, the prover computes: -+ -+- `B = GetB(D, x, y_ref)` before squaring starts -+ -+This enables a streaming algorithm that updates proof buckets at each -+checkpoint during repeated squaring, instead of materializing the full -+intermediate checkpoint array and scanning it after the loop. In practice this -+substantially reduces memory usage for compaction workloads. -+ -+## Optimization 2: Incremental GetBlock mapping -+ -+For streaming checkpoint updates, bucket index selection repeatedly calls -+`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and -+advances sequential `p` values incrementally, avoiding full modular -+exponentiation per call and avoiding a large lookup table. 
-+ -+## Optimization 3: Memory-budgeted (k, l) tuning -+ -+The wrapper can tune `(k, l)` under a configured memory budget: -+ -+- `chiavdf_set_bucket_memory_budget_bytes(...)` -+ -+If no tuned candidate is found, the code falls back to the standard parameter -+heuristics. -+ -+## Operational Notes -+ -+- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to -+ avoid unsolicited stdout noise when embedded in multi-worker clients. -+- Thread-slot assignment for the fast VDF counters is per-thread via -+ `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations -+ run in one process. -+- The production default for `enable_threads` in `parameters.h` is unchanged from -+ upstream to preserve timelord expectations. -+ diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp index af3bf805..ae834c84 100644 --- a/src/c_bindings/fast_wrapper.cpp +++ b/src/c_bindings/fast_wrapper.cpp @@ -169,11 +169,12 @@ bool tune_streaming_parameters( continue; } - unsigned __int128 updates = static_cast( - (num_iterations + static_cast(k) - 1) / static_cast(k)); uint64_t kl = static_cast(k) * static_cast(l); unsigned __int128 checkpoints = static_cast( (num_iterations + kl - 1) / kl); + // Each checkpoint can trigger up to `l` bucket updates (one per sub-block). + // Model update work as checkpoint-count scaled by `l`. + unsigned __int128 updates = checkpoints * static_cast(l); unsigned __int128 fold = static_cast(l) << (k + 1); unsigned __int128 cost = updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight; From 61e9280ead4d4cd19caf8f0984b4ab562730ad6a Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 12 May 2026 20:08:54 -0700 Subject: [PATCH 12/13] Adapt streaming callback checkpoint scheduling from fb0e2c2. 
Replace per-iteration modulo checks with next-checkpoint tracking in the streaming callback, and integrate the scheduling update with batch replay boundaries so rollback/replay semantics remain correct in the current upstreamed implementation. Co-authored-by: Cursor --- src/c_bindings/fast_wrapper.cpp | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp index ae834c84..c3351e50 100644 --- a/src/c_bindings/fast_wrapper.cpp +++ b/src/c_bindings/fast_wrapper.cpp @@ -295,6 +295,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { progress_cb(progress_cb), progress_user_data(progress_user_data), next_progress(progress_interval), + next_checkpoint_t((limit <= 1 || kl == 0) ? std::numeric_limits<uint64_t>::max() : kl), use_getblock_opt(use_getblock_opt), stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) { form id = form::identity(D); @@ -328,7 +329,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { next_progress += progress_interval; } - if (iteration % kl == 0) { + if (iteration == next_checkpoint_t) { uint64_t pos = iteration / kl; if (pos < limit) { form checkpoint; @@ -348,6 +349,14 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { .count()); } } + + const uint64_t next_pos = pos + 1; + if (next_pos < limit && kl != 0 && + next_pos <= std::numeric_limits<uint64_t>::max() / kl) { + next_checkpoint_t = next_pos * kl; + } else { + next_checkpoint_t = std::numeric_limits<uint64_t>::max(); + } } if (iteration == wanted_iter) { @@ -361,6 +370,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { if (batch_size == 0) { batch_start_iteration = 1; batch_end_iteration = 0; + next_checkpoint_t = std::numeric_limits<uint64_t>::max(); return; } // `base_iteration` is the number of completed iterations before this batch.
@@ -371,6 +381,24 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { } else { batch_end_iteration = base_iteration + batch_size; } + + if (kl == 0 || limit <= 1) { + next_checkpoint_t = std::numeric_limits<uint64_t>::max(); + return; + } + + const uint64_t first_iteration = saturating_add_u64(base_iteration, 1); + const uint64_t numerator = saturating_add_u64(first_iteration, kl - 1); + uint64_t first_pos = numerator / kl; + if (first_pos == 0) { + first_pos = 1; + } + + if (first_pos < limit && first_pos <= std::numeric_limits<uint64_t>::max() / kl) { + next_checkpoint_t = first_pos * kl; + } else { + next_checkpoint_t = std::numeric_limits<uint64_t>::max(); + } } void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override { @@ -524,6 +552,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback { ChiavdfProgressCallback progress_cb; void* progress_user_data; uint64_t next_progress; + uint64_t next_checkpoint_t = std::numeric_limits<uint64_t>::max(); size_t bucket_span = 0; std::vector buckets; From 91a2af96cd83525c20badf67a53727daccf657db Mon Sep 17 00:00:00 2001 From: Gene Hoffman Date: Tue, 12 May 2026 20:35:42 -0700 Subject: [PATCH 13/13] Address slot reuse and logging consistency in vdf fast path. Lease fast counter slots with per-slot in-use tracking so long-lived processes can recycle released slots safely, and restore the one-weso proof diagnostic behind quiet_mode to keep client logging behavior consistent. Co-authored-by: Cursor --- src/vdf.h | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/src/vdf.h b/src/vdf.h index eb1d0d39..92a56b78 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -94,8 +94,6 @@ bool quiet_mode = false; // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can // run concurrently in the same process; they must not share a pairindex.
#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) -// Keep slot allocation state as one program-wide entity for all TUs that include -// this header, so concurrent callers cannot recycle the same slot sequence. inline std::atomic<unsigned int> vdf_fast_next_slot{0}; #endif @@ -103,8 +101,45 @@ inline int vdf_fast_pairindex() { #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM) constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0])); static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0"); - thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots); - return slot; + static std::array<std::atomic<bool>, kSlots> vdf_fast_slot_in_use{}; + struct SlotLease { + std::array<std::atomic<bool>, kSlots>* slots = nullptr; + int slot = -1; + bool owns_slot = false; + ~SlotLease() { + if (owns_slot && slots != nullptr && slot >= 0) { + (*slots)[static_cast<size_t>(slot)].store(false, std::memory_order_release); + } + } + }; + + thread_local SlotLease lease; + if (lease.slot >= 0) { + return lease.slot; + } + + lease.slots = &vdf_fast_slot_in_use; + + const unsigned int start = vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed); + for (unsigned int i = 0; i < kSlots; i++) { + const unsigned int candidate = (start + i) % kSlots; + bool expected = false; + if (vdf_fast_slot_in_use[candidate].compare_exchange_strong( + expected, + true, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + lease.slot = static_cast<int>(candidate); + lease.owns_slot = true; + return lease.slot; + } + } + + // All slots are currently active. Reuse one as a best-effort fallback; the + // fast path has corruption detection and can fall back to slow squaring.
+ lease.slot = static_cast<int>(start % kSlots); + lease.owns_slot = false; + return lease.slot; #else return 0; #endif @@ -367,6 +402,12 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba proof_serialized = SerializeForm(proof_form, d_bits); Proof proof(y_serialized, proof_serialized); proof.witness_type = 0; + if (!quiet_mode) { + // Keep proof diagnostics available for vdf_client while quiet_mode + // suppresses output in embedded library-mode call paths. + std::lock_guard<std::mutex> lk(cout_lock); + std::cout << "Got simple weso proof: " << proof.hex() << "\n"; + } return proof; }