From 7847c67b743c1f1f6c9cf218c57e8163f61816be Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Mon, 23 Feb 2026 23:30:52 -0800
Subject: [PATCH 01/21] Add streaming one-wesolowski compaction APIs.

Introduce a fast C wrapper with streaming proof generation, incremental GetBlock optimization, and memory-budgeted (k,l) tuning, plus the minimal runtime/build infrastructure needed to embed chiavdf in multi-worker clients.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/bluebox_compaction.md      |  49 ++
 src/Makefile.vdf-client         |  39 +-
 src/c_bindings/fast_wrapper.cpp | 795 ++++++++++++++++++++++++++++++++
 src/c_bindings/fast_wrapper.h   | 145 ++++++
 src/threading.h                 |   4 +-
 src/vdf.h                       |  29 +-
 6 files changed, 1044 insertions(+), 17 deletions(-)
 create mode 100644 docs/bluebox_compaction.md
 create mode 100644 src/c_bindings/fast_wrapper.cpp
 create mode 100644 src/c_bindings/fast_wrapper.h

diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md
new file mode 100644
index 00000000..61cd1fd4
--- /dev/null
+++ b/docs/bluebox_compaction.md
@@ -0,0 +1,49 @@
+# Bluebox Compaction Optimizations
+
+This document describes the compaction-oriented proving path exposed by
+`src/c_bindings/fast_wrapper.h` and implemented in
+`src/c_bindings/fast_wrapper.cpp`.
+
+## Scope
+
+These APIs are intended for workloads where the expected VDF output (`y_ref`) is
+already known up front (for example, bluebox compaction jobs). They are additive
+and do not change the existing `c_wrapper` APIs.
+
+## Optimization 1: Streaming one-wesolowski
+
+Given `y_ref`, the prover computes:
+
+- `B = GetB(D, x, y_ref)` before squaring starts
+
+This enables a streaming algorithm that updates proof buckets at each
+checkpoint during repeated squaring, instead of materializing the full
+intermediate checkpoint array and scanning it after the loop. In practice this
+substantially reduces memory usage for compaction workloads.
+
+## Optimization 2: Incremental GetBlock mapping
+
+For streaming checkpoint updates, bucket index selection repeatedly calls
+`GetBlock(p, k, T, B)`. The optimized mode (the `*_getblock_opt` entry points)
+keeps a rolling modular state and advances sequential `p` values incrementally,
+avoiding full modular exponentiation per call and avoiding a large lookup table.
+
+## Optimization 3: Memory-budgeted (k, l) tuning
+
+The wrapper can tune `(k, l)` under a configured memory budget:
+
+- `chiavdf_set_bucket_memory_budget_bytes(...)`
+
+If the budget is 0, the job has fewer than 2^16 iterations, or no candidate fits
+the budget, the code falls back to the standard parameter heuristics.
+
+## Operational Notes
+
+- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to
+  avoid unsolicited stdout noise when embedded in multi-worker clients.
+- Thread-slot assignment for the fast VDF counters is per-thread via
+  `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations
+  run in one process.
+- The production default for `enable_threads` in `parameters.h` is unchanged from
+  upstream to preserve timelord expectations.
+
diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
index 59fcbb63..0fe2380a 100644
--- a/src/Makefile.vdf-client
+++ b/src/Makefile.vdf-client
@@ -26,15 +26,26 @@ ifeq ($(UNAME),Darwin)
 NOPIE =
 endif
 
-CFLAGS += $(LTO_FLAGS) $(NOPIE)
-LDFLAGS += $(LTO_FLAGS) $(NOPIE) -g
+# Optional: `make PIC=1` adds -fPIC to the C/C++/asm flags and drops $(NOPIE); the default (PIC=0) keeps $(NOPIE).
+PIC ?= 0
+ifeq ($(PIC),1)
+PICFLAGS = -fPIC
+PIEFLAGS =
+else
+PICFLAGS =
+PIEFLAGS = $(NOPIE)
+endif
+
+CFLAGS += $(LTO_FLAGS) $(PIEFLAGS) $(PICFLAGS)
+LDFLAGS += $(LTO_FLAGS) $(PIEFLAGS) -g
 ifeq ($(OS),Windows_NT)
 LDLIBS += -lmpirxx -lmpir -lws2_32
-CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(NOPIE) -fvisibility=hidden
+CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
 else
 LDLIBS += -lgmpxx -lgmp -pthread
-CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
+CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
 endif
+ASFLAGS += $(PICFLAGS)
 ifeq ($(UNAME),Darwin)
 CXXFLAGS += -D CHIAOSX=1
 # Homebrew (common on macOS) installs boost/gmp to /opt/homebrew or /usr/local
@@ -81,7 +92,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
 all: $(BINS)
 
 clean:
-	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
+	rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a
 
 $(BINS) avx512_test: %: %.o lzcnt.o $(ASM_OBJS)
 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
@@ -91,6 +102,9 @@ $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
 lzcnt.o: refcode/lzcnt.c
 	$(CC) $(CFLAGS) -c refcode/lzcnt.c
 
+%.o: %.s  # assemble through $(CC) so ASFLAGS (which carries PICFLAGS) is applied
+	$(CC) -c $< -o $@ $(ASFLAGS)
+
 asm_compiled.s: compile_asm
 	./compile_asm
 
@@ -104,6 +118,21 @@ compile_asm: compile_asm.o
 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
 
 HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base_hw.o vdf_hw_symbol_anchors.o prover_runtime.o lzcnt.o
+# ---------------------------------------------------------------------------
+# Static library: fast one-wesolowski proof (BBR integration)
+# ---------------------------------------------------------------------------
+# `make fastlib` builds the archive; it is recreated from scratch so members dropped from FASTLIB_OBJS never linger.
+FASTLIB = libchiavdf_fastc.a
+FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o $(ASM_OBJS)
+
+.PHONY: fastlib
+
+fastlib: $(FASTLIB)
+
+$(FASTLIB): $(FASTLIB_OBJS)
+	rm -f $@ && $(AR) rcs $@ $^
+
+c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS)
 EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o
 ifeq ($(OS),Windows_NT)
 HW_LIB = hw/libft4222/libft4222.lib
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
new file mode 100644
index 00000000..198d0a87
--- /dev/null
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -0,0 +1,795 @@
+#include "fast_wrapper.h"
+
+#include <atomic>
+#include <chrono>
+#include <limits>
+#include <mutex>
+#include <vector>
+
+#include "../vdf.h"
+#include "../create_discriminant.h"
+
+// Runtime configuration knobs required by `parameters.h`.
+// These are `extern` variables there, but each binary defines them explicitly.
+bool use_divide_table = false;
+int gcd_base_bits = 50;
+int gcd_128_max_iter = 3;
+std::string asmprefix = "cel_";
+bool enable_all_instructions = false;
+
+namespace {
+std::once_flag init_once;  // guards the one-time init_chiavdf_fast() call
+std::atomic<uint64_t> bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL);  // default: 128 MiB
+std::atomic<bool> streaming_stats_enabled(false);  // sampled when a streaming callback is constructed
+
+struct LastStreamingParameters {
+    uint32_t k = 0;
+    uint32_t l = 0;
+    bool tuned = false;  // true when (k,l) came from tune_streaming_parameters()
+    bool set = false;    // true when (k,l) above were recorded by a streaming proof on this thread
+};
+
+thread_local LastStreamingParameters last_streaming_parameters;
+
+struct LastStreamingStats {
+    uint64_t checkpoint_total_ns = 0;        // time inside process_checkpoint() (recorded calls only)
+    uint64_t checkpoint_event_total_ns = 0;  // SetForm + process_checkpoint per checkpoint event
+    uint64_t finalize_total_ns = 0;          // time inside finalize_proof()
+    uint64_t checkpoint_calls = 0;           // number of recorded process_checkpoint() calls
+    uint64_t bucket_updates = 0;             // bucket updates counted across recorded checkpoints
+    bool set = false;                        // true only when stats were enabled for the proof
+};
+
+thread_local LastStreamingStats last_streaming_stats;
+
+void init_chiavdf_fast() {  // one-time global setup; entry points run it via std::call_once(init_once, ...)
+    init_gmp();
+    set_rounding_mode();
+
+    // Match the vdf_client runtime selection for AVX2.
+    if (hasAVX2()) {
+        gcd_base_bits = 63;
+        gcd_128_max_iter = 2;
+    } else {
+        gcd_base_bits = 50;
+        gcd_128_max_iter = 3;
+    }
+
+    // Ensure we run the one-wesolowski path by default.
+    fast_algorithm = false;
+    two_weso = false;
+    quiet_mode = true;  // keep the embedded library from printing to stdout
+}
+
+ChiavdfByteArray empty_result() { return {nullptr, 0}; }  // failure sentinel: null data, zero length
+
+uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
+    // Per-form memory estimate used by the (k,l) tuner. It is deliberately
+    // pessimistic: a class group form holds 3 GMP-backed integers that grow to
+    // about the discriminant size (or beyond) during NUCOMP, so we charge 16x
+    // the discriminant byte length instead of the raw serialized size. This
+    // avoids picking parameters that risk paging/OOM.
+    constexpr uint64_t kMinFormBytes = 2048;  // floor applied to small discriminants
+    const uint64_t discriminant_bytes =
+        (static_cast<uint64_t>(discriminant_size_bits) + 7) / 8;  // bits -> bytes, rounded up
+    const uint64_t scaled = discriminant_bytes * 16;
+    // Clamp from below: never report less than kMinFormBytes.
+    return scaled < kMinFormBytes ? kMinFormBytes : scaled;
+}
+
+bool tune_streaming_parameters(
+    uint64_t num_iterations,        // T: number of squarings the proof covers
+    size_t discriminant_size_bits,  // feeds the per-form memory estimate
+    uint64_t memory_budget_bytes,   // 0 disables tuning (returns false)
+    uint32_t& out_l,                // written only when a candidate is found
+    uint32_t& out_k) {              // written only when a candidate is found
+    if (memory_budget_bytes == 0) {
+        return false;
+    }
+
+    // Keep 20% headroom for GMP scratch/process overhead; 128-bit math so huge budgets cannot wrap.
+    uint64_t budget = static_cast<uint64_t>((static_cast<unsigned __int128>(memory_budget_bytes) * 80) / 100);
+    uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits);
+    if (budget < bytes_per_form) {
+        return false;
+    }
+
+    unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
+    bool found = false;
+
+    // Empirical tuning notes (1024-bit discriminants, AVX2 build):
+    // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
+    // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs.
+    //
+    // So checkpoint counts should be weighted much lower than updates/fold.
+    constexpr unsigned __int128 update_weight = 16;
+    constexpr unsigned __int128 fold_weight = 16;
+    constexpr unsigned __int128 checkpoint_weight = 1;
+
+    // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work
+    // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k).
+    for (uint32_t k = 4; k <= 20; k++) {
+        unsigned __int128 buckets_per_row = static_cast<unsigned __int128>(1) << k;
+
+        for (uint32_t l = 1; l <= 64; l++) {
+            unsigned __int128 form_count = buckets_per_row * static_cast<unsigned __int128>(l);
+            unsigned __int128 mem_required =
+                form_count * static_cast<unsigned __int128>(bytes_per_form);
+            if (mem_required > static_cast<unsigned __int128>(budget)) {
+                continue;  // l rows of 2^k buckets would not fit in the budget
+            }
+
+            unsigned __int128 updates = static_cast<unsigned __int128>(
+                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));  // ceil(T / k)
+            uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
+            unsigned __int128 checkpoints = static_cast<unsigned __int128>(
+                (num_iterations + kl - 1) / kl);  // ceil(T / (k*l))
+            unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);  // l * 2^(k+1)
+            unsigned __int128 cost =
+                updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;
+
+            if (!found || cost < best_cost) {  // strict '<': ties keep the earlier (smaller k, then smaller l) pair
+                found = true;
+                best_cost = cost;
+                out_k = k;
+                out_l = l;
+            }
+        }
+    }
+
+    return found;
+}
+
+uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) {  // caller must ensure k * (i + 1) <= T (unsigned subtraction below)
+    integer res = FastPow(2, T - k * (i + 1), B);  // presumably 2^(T - k*(i+1)) mod B -- confirm against FastPow
+    mpz_mul_2exp(res.impl, res.impl, k);  // res *= 2^k
+    res = res / B;
+    auto res_vector = res.to_vector();
+    return res_vector.empty() ? 0 : res_vector[0];  // first to_vector() entry; 0 when the vector is empty
+}
+
+class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {  // base callback plus optional progress reporting
+  public:
+    ProgressOneWesolowskiCallback(
+        integer& D,
+        form& f,
+        uint64_t wanted_iter,
+        uint64_t progress_interval,  // 0 disables progress reporting
+        ChiavdfProgressCallback progress_cb,  // nullptr disables progress reporting
+        void* progress_user_data)  // passed back to progress_cb unchanged
+        : OneWesolowskiCallback(D, f, wanted_iter),
+          progress_interval(progress_interval),
+          progress_cb(progress_cb),
+          progress_user_data(progress_user_data),
+          next_progress(progress_interval) {}  // first report is due at progress_interval
+
+    void OnIteration(int type, void* data, uint64_t iteration) override {
+        OneWesolowskiCallback::OnIteration(type, data, iteration);  // base class handling always runs first
+
+        if (progress_cb == nullptr || progress_interval == 0) {
+            return;
+        }
+
+        uint64_t done = iteration + 1;  // treat the callback index as "index + 1 iterations completed"
+        if (done > wanted_iter) {
+            return;  // no progress reports past the requested iteration count
+        }
+
+        if (done >= next_progress) {
+            progress_cb(next_progress, progress_user_data);  // reports the threshold, not `done`
+            next_progress += progress_interval;  // at most one report per OnIteration call
+        }
+    }
+
+  private:
+    uint64_t progress_interval;
+    ChiavdfProgressCallback progress_cb;
+    void* progress_user_data;
+    uint64_t next_progress;  // next threshold at which progress_cb fires
+};
+
+class StreamingOneWesolowskiCallback final : public WesolowskiCallback {  // folds each checkpoint into proof buckets while squaring runs
+  public:
+    StreamingOneWesolowskiCallback(
+        integer& D,
+        uint64_t wanted_iter,
+        uint32_t k,
+        uint32_t l,
+        uint64_t limit,  // checkpoints at positions >= limit are ignored
+        integer& B,  // copied into the callback (member is a value, not a reference)
+        bool use_getblock_opt,
+        uint64_t progress_interval,
+        ChiavdfProgressCallback progress_cb,
+        void* progress_user_data)
+        : WesolowskiCallback(D),
+          wanted_iter(wanted_iter),
+          k(k),
+          l(l),
+          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
+          limit(limit),
+          B(B),
+          progress_interval(progress_interval),
+          progress_cb(progress_cb),
+          progress_user_data(progress_user_data),
+          next_progress(progress_interval),
+          use_getblock_opt(use_getblock_opt),
+          stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
+        form id = form::identity(D);
+        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);  // l rows of 2^k buckets, all starting at the identity
+
+        if (use_getblock_opt) {
+            getblock_ok = init_getblock_opt_state();
+        }
+    }
+
+    void OnIteration(int type, void* data, uint64_t iteration) override {
+        iteration++;  // from here on: number of completed iterations (callback index + 1)
+        if (iteration > wanted_iter) {
+            return;
+        }
+
+        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
+            progress_cb(next_progress, progress_user_data);
+            next_progress += progress_interval;
+        }
+
+        if (iteration % kl == 0) {  // a checkpoint falls on every multiple of k*l
+            uint64_t pos = iteration / kl;
+            if (pos < limit) {
+                form checkpoint;
+                auto started_at = std::chrono::steady_clock::time_point{};
+                if (stats_enabled) {
+                    started_at = std::chrono::steady_clock::now();
+                }
+                SetForm(type, data, &checkpoint);
+                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
+                if (stats_enabled) {
+                    checkpoint_event_total_ns += static_cast<uint64_t>(
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            std::chrono::steady_clock::now() - started_at)
+                            .count());
+                }
+            }
+        }
+
+        if (iteration == wanted_iter) {  // capture the final output form
+            SetForm(type, data, &result);
+            has_result = true;
+        }
+    }
+
+    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
+        const bool do_stats = stats_enabled && record_stats;
+        auto started_at = std::chrono::steady_clock::time_point{};
+        if (do_stats) {
+            started_at = std::chrono::steady_clock::now();
+        }
+        // Fold checkpoint i into one bucket per row j; the bucket is chosen by block p = i*l + j.
+        uint64_t local_updates = 0;
+        for (uint32_t j = 0; j < l; j++) {
+            uint64_t p = i * static_cast<uint64_t>(l) + static_cast<uint64_t>(j);
+            uint64_t needed = static_cast<uint64_t>(k) * (p + 1);
+            if (wanted_iter < needed) {
+                break;  // block p does not fit in wanted_iter; larger j only need more
+            }
+            uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B);
+            if (do_stats) {
+                local_updates++;
+            }
+            nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L);  // bucket(j, b) *= checkpoint
+        }
+
+        if (do_stats) {
+            checkpoint_calls++;
+            bucket_updates += local_updates;
+            checkpoint_total_ns += static_cast<uint64_t>(
+                std::chrono::duration_cast<std::chrono::nanoseconds>(
+                    std::chrono::steady_clock::now() - started_at)
+                    .count());
+        }
+    }
+
+    bool init_ok() const { return getblock_ok; }  // false when the incremental GetBlock state could not be set up
+
+    bool ok() const { return has_result; }  // true once the form at wanted_iter has been captured
+
+    const form& y() const { return result; }  // meaningful only when ok() is true
+
+    form finalize_proof() {  // combines all buckets into the proof form
+        auto started_at = std::chrono::steady_clock::time_point{};
+        if (stats_enabled) {
+            started_at = std::chrono::steady_clock::now();
+        }
+
+        PulmarkReducer reducer;
+        form id = form::identity(D);
+
+        uint64_t k1 = k / 2;  // high part of the bucket index (b1)
+        uint64_t k0 = k - k1;  // low part of the bucket index (b0); index = b1 * 2^k0 + b0
+        form x = id;
+
+        for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {  // rows from last to first
+            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
+
+            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {  // contribution of the high index part
+                form z = id;
+                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                }
+                z = FastPowFormNucomp(
+                    z,
+                    D,
+                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
+                    L,
+                    reducer);
+                nucomp_form(x, x, z, D, L);
+            }
+
+            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {  // contribution of the low index part
+                form z = id;
+                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                }
+                z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
+                nucomp_form(x, x, z, D, L);
+            }
+        }
+
+        reducer.reduce(x);
+
+        if (stats_enabled) {
+            finalize_total_ns += static_cast<uint64_t>(
+                std::chrono::duration_cast<std::chrono::nanoseconds>(
+                    std::chrono::steady_clock::now() - started_at)
+                    .count());
+        }
+        return x;
+    }
+
+    bool stats_ok() const { return stats_enabled; }
+
+    LastStreamingStats stats() const {  // snapshot of the counters gathered by this callback
+        LastStreamingStats out;
+        out.checkpoint_total_ns = checkpoint_total_ns;
+        out.checkpoint_event_total_ns = checkpoint_event_total_ns;
+        out.finalize_total_ns = finalize_total_ns;
+        out.checkpoint_calls = checkpoint_calls;
+        out.bucket_updates = bucket_updates;
+        out.set = stats_enabled;
+        return out;
+    }
+
+  private:
+    form& bucket(uint32_t j, uint64_t b) {  // row-major lookup: row j, column b; no bounds check
+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        return buckets[idx];
+    }
+
+    const form& bucket(uint32_t j, uint64_t b) const {
+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        return buckets[idx];
+    }
+
+    uint64_t wanted_iter;
+    uint32_t k;
+    uint32_t l;
+    uint64_t kl;  // k * l: spacing between checkpoints, in iterations
+    uint64_t limit;
+    integer B;
+    uint64_t progress_interval;
+    ChiavdfProgressCallback progress_cb;
+    void* progress_user_data;
+    uint64_t next_progress;
+
+    std::vector<form> buckets;  // l * 2^k forms, indexed through bucket(j, b)
+    form result;
+    bool has_result = false;
+
+    bool use_getblock_opt;
+    bool getblock_ok = true;
+    uint64_t getblock_next_p = 0;  // the p that getblock_r currently corresponds to
+    integer getblock_inv_2k;  // inverse of 2^k modulo B
+    integer getblock_r;  // rolling state, multiplied by getblock_inv_2k for each step of p
+    integer getblock_tmp;  // scratch for the shift/divide in get_block_opt()
+
+    bool stats_enabled;
+    uint64_t checkpoint_total_ns = 0;
+    uint64_t checkpoint_event_total_ns = 0;
+    uint64_t finalize_total_ns = 0;
+    uint64_t checkpoint_calls = 0;
+    uint64_t bucket_updates = 0;
+
+    bool init_getblock_opt_state() {
+        if (k == 0) {
+            return false;
+        }
+        uint64_t k_u64 = static_cast<uint64_t>(k);
+        if (wanted_iter < k_u64) {
+            return true;  // no full k-sized block fits, so there is no rolling state to prepare
+        }
+
+        integer two_k_mod = FastPow(2, k_u64, B);
+        if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) {
+            return false;  // mpz_invert returns 0 when no inverse exists
+        }
+
+        getblock_r = FastPow(2, wanted_iter - k_u64, B);  // state for p = 0
+        getblock_next_p = 0;
+        return true;
+    }
+
+    uint64_t get_block_opt(uint64_t p) {
+        if (!getblock_ok || wanted_iter < static_cast<uint64_t>(k)) {
+            return get_block(p, k, wanted_iter, B);
+        }
+
+        // Expected call pattern is sequential `p`. If we ever get out of sync,
+        // advance state forward or fall back to the slow mapping.
+        if (p < getblock_next_p) {
+            return get_block(p, k, wanted_iter, B);
+        }
+        while (getblock_next_p < p) {
+            mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
+            mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
+            getblock_next_p++;
+        }
+
+        mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k);  // tmp = r * 2^k
+        mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl);  // tmp = floor(tmp / B)
+        uint64_t b = mpz_get_ui(getblock_tmp.impl);
+
+        mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);  // step the state to p + 1
+        mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
+        getblock_next_p++;
+
+        return b;
+    }
+};
+
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(  // shared body of both streaming entry points; may throw
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,  // serialized expected output; the proof is rejected if the result differs
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data,
+    bool use_getblock_opt) {  // true selects the incremental GetBlock mapping
+    std::call_once(init_once, init_chiavdf_fast);
+
+    last_streaming_stats = LastStreamingStats{};
+    last_streaming_parameters = LastStreamingParameters{};  // never report (k,l) left over from an earlier call
+    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
+        y_ref_s == nullptr || y_ref_s_size == 0) {
+        return empty_result();
+    }
+    if (num_iterations == 0) {
+        return empty_result();
+    }
+
+    std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
+    integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
+    integer L = root(-D, 4);
+
+    form x = DeserializeForm(D, x_s, x_s_size);
+    form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size);
+
+    uint32_t k;
+    uint32_t l;
+    bool tuned = false;
+    const uint64_t budget =
+        bucket_memory_budget_bytes.load(std::memory_order_relaxed);
+    if (num_iterations >= (1 << 16)) {  // tuning is only attempted for >= 2^16 iterations
+        tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k);
+    }
+    if (!tuned) {
+        if (num_iterations >= (1 << 16)) {
+            ApproximateParameters(num_iterations, l, k);
+        } else {
+            k = 10;
+            l = 1;
+        }
+    }
+    if (k == 0) {  // defensive clamp: k * l is used as a divisor below
+        k = 1;
+    }
+    if (l == 0) {
+        l = 1;
+    }
+
+    last_streaming_parameters.k = k;
+    last_streaming_parameters.l = l;
+    last_streaming_parameters.tuned = tuned;
+    last_streaming_parameters.set = true;
+
+    uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
+    uint64_t limit = num_iterations / kl;  // limit = ceil(num_iterations / kl)
+    if (num_iterations % kl) {
+        limit++;
+    }
+
+    integer B = GetB(D, x, y_ref);  // derived from the known output before any squaring happens
+
+    std::atomic<bool> stopped(false);
+    StreamingOneWesolowskiCallback weso(
+        D,
+        num_iterations,
+        k,
+        l,
+        limit,
+        B,
+        use_getblock_opt,
+        progress_interval,
+        progress_cb,
+        progress_user_data);
+
+    if (!weso.init_ok()) {
+        return empty_result();
+    }
+
+    weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false);  // checkpoint 0 is the input form itself
+
+    FastStorage* fast_storage = nullptr;
+    repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
+
+    if (!weso.ok()) {
+        return empty_result();
+    }
+    if (!(weso.y() == y_ref)) {  // computed output does not match the caller-supplied y_ref
+        return empty_result();
+    }
+
+    form proof_form = weso.finalize_proof();
+
+    if (weso.stats_ok()) {
+        last_streaming_stats = weso.stats();
+    }
+
+    int d_bits = D.num_bits();
+    std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
+    std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
+
+    if (y_serialized.empty() || proof_serialized.empty()) {
+        return empty_result();
+    }
+
+    const size_t total = y_serialized.size() + proof_serialized.size();
+    uint8_t* out = new uint8_t[total];  // ownership passes to the caller; release with chiavdf_free_byte_array()
+    std::copy(y_serialized.begin(), y_serialized.end(), out);
+    std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
+    return ChiavdfByteArray{out, total};  // layout: y || proof
+}
+} // namespace
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast(  // convenience form: same as *_with_progress without progress reporting
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations) {
+    return chiavdf_prove_one_weso_fast_with_progress(
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        discriminant_size_bits,
+        num_iterations,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(  // returns y || proof, or {nullptr, 0} on any failure
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    try {  // no exception may cross the C ABI boundary
+        std::call_once(init_once, init_chiavdf_fast);
+
+        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
+            return empty_result();
+        }
+        if (num_iterations == 0) {
+            return empty_result();
+        }
+
+        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
+        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
+        integer L = root(-D, 4);
+
+        form x = DeserializeForm(D, x_s, x_s_size);
+
+        std::atomic<bool> stopped(false);
+        ProgressOneWesolowskiCallback weso(
+            D,
+            x,
+            num_iterations,
+            progress_interval,
+            progress_cb,
+            progress_user_data);
+
+        // Run the fast repeated-squaring engine to `num_iterations`.
+        // The callback stores all intermediates needed for the proof.
+        FastStorage* fast_storage = nullptr;
+        repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
+
+        // Now generate the compact proof from the stored intermediates.
+        Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped);
+        if (proof.y.empty() || proof.proof.empty()) {
+            return empty_result();
+        }
+
+        const size_t total = proof.y.size() + proof.proof.size();
+        uint8_t* out = new uint8_t[total];  // ownership passes to the caller; release with chiavdf_free_byte_array()
+        std::copy(proof.y.begin(), proof.y.end(), out);
+        std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size());
+        return ChiavdfByteArray{out, total};
+    } catch (...) {
+        return empty_result();  // every exception is reported as the empty result
+    }
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(  // convenience form: streaming proof without progress reporting
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations) {
+    return chiavdf_prove_one_weso_fast_streaming_with_progress(
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        y_ref_s,
+        y_ref_s_size,
+        discriminant_size_bits,
+        num_iterations,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(  // streaming proof using the plain get_block() mapping
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    try {  // no exception may cross the C ABI boundary
+        return chiavdf_prove_one_weso_fast_streaming_impl(
+            challenge_hash,
+            challenge_size,
+            x_s,
+            x_s_size,
+            y_ref_s,
+            y_ref_s_size,
+            discriminant_size_bits,
+            num_iterations,
+            progress_interval,
+            progress_cb,
+            progress_user_data,
+            /*use_getblock_opt=*/false);
+    } catch (...) {
+        return empty_result();  // every exception is reported as the empty result
+    }
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(  // convenience form: getblock_opt proof without progress reporting
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations) {
+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        y_ref_s,
+        y_ref_s_size,
+        discriminant_size_bits,
+        num_iterations,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(  // streaming proof using the incremental GetBlock mapping
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    try {  // no exception may cross the C ABI boundary
+        return chiavdf_prove_one_weso_fast_streaming_impl(
+            challenge_hash,
+            challenge_size,
+            x_s,
+            x_s_size,
+            y_ref_s,
+            y_ref_s_size,
+            discriminant_size_bits,
+            num_iterations,
+            progress_interval,
+            progress_cb,
+            progress_user_data,
+            /*use_getblock_opt=*/true);
+    } catch (...) {
+        return empty_result();  // every exception is reported as the empty result
+    }
+}
+
+extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) {  // process-wide; read at the start of each streaming proof
+    bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed);
+}
+
+extern "C" void chiavdf_set_enable_streaming_stats(bool enable) {  // the flag itself is process-wide
+    streaming_stats_enabled.store(enable, std::memory_order_relaxed);
+    last_streaming_stats = LastStreamingStats{};  // clears only the calling thread's (thread_local) stats
+}
+
+extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) {  // reads this thread's record
+    if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) {
+        return false;  // all three output pointers are required
+    }
+    if (!last_streaming_parameters.set) {
+        return false;  // nothing recorded on this thread; outputs are left untouched
+    }
+    *out_k = last_streaming_parameters.k;
+    *out_l = last_streaming_parameters.l;
+    *out_tuned = last_streaming_parameters.tuned;
+    return true;
+}
+
+extern "C" bool chiavdf_get_last_streaming_stats(  // reads this thread's (thread_local) stats record
+    uint64_t* out_checkpoint_total_ns,
+    uint64_t* out_checkpoint_event_total_ns,
+    uint64_t* out_finalize_total_ns,
+    uint64_t* out_checkpoint_calls,
+    uint64_t* out_bucket_updates) {
+    if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr ||
+        out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr ||
+        out_bucket_updates == nullptr) {
+        return false;  // all five output pointers are required
+    }
+    if (!last_streaming_stats.set) {
+        return false;  // no stats recorded on this thread; outputs are left untouched
+    }
+    *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns;
+    *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns;
+    *out_finalize_total_ns = last_streaming_stats.finalize_total_ns;
+    *out_checkpoint_calls = last_streaming_stats.checkpoint_calls;
+    *out_bucket_updates = last_streaming_stats.bucket_updates;
+    return true;
+}
+
+extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }  // safe for the {nullptr, 0} failure result
diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
new file mode 100644
index 00000000..bf33f320
--- /dev/null
+++ b/src/c_bindings/fast_wrapper.h
@@ -0,0 +1,145 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    uint8_t* data;
+    size_t length;
+} ChiavdfByteArray;
+
+typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
+
+// Configure the per-process memory budget used by the parameter tuner when
+// selecting `(k,l)` for streaming/bucket-based proving.
+//
+// The budget is per worker process (not global across multiple processes).
+//
+// If `bytes` is 0, the default chiavdf heuristic is used.
+void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes);
+
+// Debug helper: returns the `(k,l)` parameters selected for the most recent
+// streaming proof computed on the current thread.
+//
+// Returns true if parameters are available.
+bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned);
+
+// Enable lightweight timing counters for the streaming prover.
+//
+// When enabled, the native library records basic timing counters for the most
+// recent streaming proof computed on the current thread. The enable flag is
+// process-wide, and each call also clears the calling thread's last stats. For
+// benchmarking and tuning only; keep disabled in production to avoid overhead.
+void chiavdf_set_enable_streaming_stats(bool enable);
+
+// Debug helper: returns timing counters for the most recent streaming proof on
+// the current thread.
+//
+// Returns true if stats are available (i.e. stats enabled and a streaming proof
+// was computed successfully).
+bool chiavdf_get_last_streaming_stats(
+    uint64_t* out_checkpoint_total_ns,
+    uint64_t* out_checkpoint_event_total_ns,
+    uint64_t* out_finalize_total_ns,
+    uint64_t* out_checkpoint_calls,
+    uint64_t* out_bucket_updates);
+
+// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
+//
+// On success, returns `y || proof` where:
+// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
+// - `proof` is the serialized witness form (same size as `y`)
+//
+// On failure, returns `{NULL, 0}`.
+ChiavdfByteArray chiavdf_prove_one_weso_fast(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
+// the proving thread every `progress_interval` iterations completed.
+//
+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
+// bucket-accumulation algorithm ("Optimization 1" in docs/bluebox_compaction.md),
+// which requires the expected output `y_ref` up front (as in bluebox compaction jobs).
+//
+// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`).
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
+// `progress_cb` from the proving thread every `progress_interval` iterations.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
+// implementation of the `GetBlock()` mapping (avoids per-block modular
+// exponentiation without allocating a full `GetBlock` table).
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
+// invokes `progress_cb` from the proving thread every `progress_interval`
+// iterations.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+void chiavdf_free_byte_array(ChiavdfByteArray array);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/threading.h b/src/threading.h
index 3244b3c3..8354d824 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -566,8 +566,8 @@ struct alignas(64) thread_counter {
     }
 };
 
-thread_counter master_counter[100];
-thread_counter slave_counter[100];
+thread_counter master_counter[512];
+thread_counter slave_counter[512];
 
 struct thread_state {
     int pairindex;
diff --git a/src/vdf.h b/src/vdf.h
index 7bb911f9..f24c09c6 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -87,6 +87,18 @@ std::mutex new_event_mutex, cout_lock;
 bool debug_mode = false;
 bool fast_algorithm = false;
 bool two_weso = false;
+bool quiet_mode = false;
+
+// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
+// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
+// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
+// run concurrently in the same process; they must not share a pairindex.
+inline int vdf_fast_pairindex() {
+    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
+    static std::atomic<int> next_slot{0};
+    thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
+    return slot;
+}
 
 //always works
 void repeated_square_original(vdf_original &vdfo, form& f, const integer&, const integer&, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
@@ -195,7 +207,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
         // x86/x64: use the phased pipeline.
         square_state_type square_state;
-        square_state.pairindex = 0;
+        square_state.pairindex = vdf_fast_pairindex();
         actual_iterations = repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
 #else
         // Non-x86: use the C++ NUDUPL path (faster and lower maintenance than the phased pipeline).
@@ -298,10 +310,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
             }
         #endif
     }
-    {
-        // this shouldn't be needed but avoids some false positive in TSAN
-        std::lock_guard<std::mutex> lk(cout_lock);
-        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
+    if (!quiet_mode) {
+        {
+            // this shouldn't be needed but avoids some false positive in TSAN
+            std::lock_guard<std::mutex> lk(cout_lock);
+            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
+        }
     }
 
     #ifdef VDF_TEST
@@ -337,11 +351,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
     proof_serialized = SerializeForm(proof_form, d_bits);
     Proof proof(y_serialized, proof_serialized);
     proof.witness_type = 0;
-    {
-        // this shouldn't be needed but avoids some false positive in TSAN
-        std::lock_guard<std::mutex> lk(cout_lock);
-        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
-    }
     return proof;
 }
 

From 7be07522d02eca9fc65ca9b34b96eb2057659e76 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Mon, 23 Feb 2026 23:41:52 -0800
Subject: [PATCH 02/21] Fix non-x86 build break in vdf_fast_pairindex.

Guard the fast pairindex slot selection behind the existing x86/asm feature checks and return slot 0 on non-x86 targets, where threading counters are not compiled.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/vdf.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/vdf.h b/src/vdf.h
index f24c09c6..c2f8834f 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -94,10 +94,14 @@ bool quiet_mode = false;
 // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
 // run concurrently in the same process; they must not share a pairindex.
 inline int vdf_fast_pairindex() {
+#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
     static std::atomic<int> next_slot{0};
     thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
     return slot;
+#else
+    return 0;
+#endif
 }
 
 //always works

From 3755be28167c172b1bb6115b081d1d23903a6d98 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Mon, 23 Feb 2026 23:57:04 -0800
Subject: [PATCH 03/21] Ensure cmake is present on macOS CI runners.

Install cmake via Homebrew and export its bin path in the C libraries and wheel workflows so self-hosted macOS jobs don't fail when cmake is missing from PATH.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-c-libraries.yml | 11 +++++++++++
 .github/workflows/build.yml             | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml
index 00ca38c9..db833104 100644
--- a/.github/workflows/build-c-libraries.yml
+++ b/.github/workflows/build-c-libraries.yml
@@ -82,6 +82,17 @@ jobs:
         fetch-depth: 1
         path: mpir_gc_x64
 
+    - name: Ensure cmake available (macOS)
+      if: matrix.os.matrix == 'macos'
+      shell: bash
+      run: |
+        brew ls --versions cmake >/dev/null 2>&1 || brew install cmake
+        CMAKE_BIN="$(brew --prefix cmake)/bin"
+        if [ -d "$CMAKE_BIN" ]; then
+          echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+        fi
+        cmake --version
+
     - name: Build
       working-directory: src
       env:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index cd6bec02..4ad967ec 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -102,6 +102,17 @@ jobs:
       with:
         python-version: ${{ matrix.python.major-dot-minor }}
 
+    - name: Ensure cmake available (macOS)
+      if: matrix.os.matrix == 'macos'
+      shell: bash
+      run: |
+        brew ls --versions cmake >/dev/null 2>&1 || brew install cmake
+        CMAKE_BIN="$(brew --prefix cmake)/bin"
+        if [ -d "$CMAKE_BIN" ]; then
+          echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+        fi
+        cmake --version
+
     - name: Install pipx
       run: |
         pip install pipx

From 073427ac5cc60adea976104e1aff47bd9fb2ccc4 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:00:51 -0800
Subject: [PATCH 04/21] Improve fast-path batch replay handling and harden
 pairindex slot allocation.

Track and roll back per-batch checkpoints when replaying a failed fast batch, and switch pairindex slot allocation to unsigned atomics to avoid negative modulo indexing after counter wraparound.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 44 ++++++++++++++++++++++++++++++++-
 src/callback.h                  |  8 ++++++
 src/vdf.h                       | 12 ++++++---
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 198d0a87..5f01e905 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -243,6 +243,9 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
                 }
                 SetForm(type, data, &checkpoint);
                 process_checkpoint(pos, checkpoint, /*record_stats=*/true);
+                if (iteration >= batch_start_iteration && iteration <= batch_end_iteration) {
+                    current_batch_checkpoints.push_back(BatchCheckpoint{pos, checkpoint});
+                }
                 if (stats_enabled) {
                     checkpoint_event_total_ns += static_cast<uint64_t>(
                         std::chrono::duration_cast<std::chrono::nanoseconds>(
@@ -258,7 +261,44 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         }
     }
 
+    void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) override {
+        current_batch_checkpoints.clear();
+        if (batch_size == 0) {
+            batch_start_iteration = 1;
+            batch_end_iteration = 0;
+            return;
+        }
+        batch_start_iteration = base_iteration + 1;
+        if (std::numeric_limits<uint64_t>::max() - base_iteration < batch_size) {
+            batch_end_iteration = std::numeric_limits<uint64_t>::max();
+        } else {
+            batch_end_iteration = base_iteration + batch_size;
+        }
+    }
+
+    void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {
+        for (const BatchCheckpoint& entry : current_batch_checkpoints) {
+            rollback_checkpoint(entry.index, entry.checkpoint);
+        }
+        OnBatchStart(base_iteration, batch_size);
+    }
+
     void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
+        apply_checkpoint(i, checkpoint, record_stats);
+    }
+
+  private:
+    struct BatchCheckpoint {
+        uint64_t index;
+        form checkpoint;
+    };
+
+    void rollback_checkpoint(uint64_t i, const form& checkpoint) {
+        form inverse_checkpoint = checkpoint.inverse();
+        apply_checkpoint(i, inverse_checkpoint, /*record_stats=*/false);
+    }
+
+    void apply_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
         const bool do_stats = stats_enabled && record_stats;
         auto started_at = std::chrono::steady_clock::time_point{};
         if (do_stats) {
@@ -359,7 +399,6 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         return out;
     }
 
-  private:
     form& bucket(uint32_t j, uint64_t b) {
         size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
         return buckets[idx];
@@ -391,6 +430,9 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     integer getblock_inv_2k;
     integer getblock_r;
     integer getblock_tmp;
+    uint64_t batch_start_iteration = 1;
+    uint64_t batch_end_iteration = 0;
+    std::vector<BatchCheckpoint> current_batch_checkpoints;
 
     bool stats_enabled;
     uint64_t checkpoint_total_ns = 0;
diff --git a/src/callback.h b/src/callback.h
index f4764bbf..9ebf3543 100644
--- a/src/callback.h
+++ b/src/callback.h
@@ -73,6 +73,14 @@ class WesolowskiCallback :public INUDUPLListener {
     }
 
     virtual void OnIteration(int type, void *data, uint64_t iteration) = 0;
+    virtual void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) {
+        (void)base_iteration;
+        (void)batch_size;
+    }
+    virtual void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) {
+        (void)base_iteration;
+        (void)batch_size;
+    }
 
     std::unique_ptr<form[]> forms;
     size_t forms_capacity = 0;
diff --git a/src/vdf.h b/src/vdf.h
index c2f8834f..8ca75d8c 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -95,9 +95,9 @@ bool quiet_mode = false;
 // run concurrently in the same process; they must not share a pairindex.
 inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
-    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
-    static std::atomic<int> next_slot{0};
-    thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
+    constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
+    static std::atomic<unsigned int> next_slot{0};
+    thread_local int slot = int(next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
     return slot;
 #else
     return 0;
@@ -201,6 +201,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
         #endif
 
         uint64 batch_size=c_checkpoint_interval;
+        if (weso != NULL) {
+            weso->OnBatchStart(num_iterations, batch_size);
+        }
 
         #ifdef ENABLE_TRACK_CYCLES
             print( "track cycles enabled; results will be wrong" );
@@ -231,6 +234,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
 
         if (actual_iterations==~uint64(0)) {
             //corruption; f is unchanged. do the entire batch with the slow algorithm
+            if (weso != NULL) {
+                weso->OnBatchReplay(num_iterations, batch_size);
+            }
             repeated_square_original(*weso->vdfo, f, D, L, num_iterations, batch_size, weso);
             actual_iterations=batch_size;
 

From fd000ab88ded1cc3386acd892401cb626ae3cc14 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:33:22 -0800
Subject: [PATCH 05/21] Clarify batch iteration indexing in streaming callback.

Document that batch bounds use completed-iteration base values while OnIteration is normalized to 1-based indices to avoid ambiguity in replay tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 5f01e905..61d24599 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -268,6 +268,8 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
             batch_end_iteration = 0;
             return;
         }
+        // `base_iteration` is the number of completed iterations before this batch.
+        // `OnIteration` normalizes to 1-based (`iteration++`), so this batch is [base+1, base+size].
         batch_start_iteration = base_iteration + 1;
         if (std::numeric_limits<uint64_t>::max() - base_iteration < batch_size) {
             batch_end_iteration = std::numeric_limits<uint64_t>::max();

From 3f82dc2786903e0029d92fc36d5f95f6a04d94bc Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:48:29 -0800
Subject: [PATCH 06/21] Add streaming tuner diagnostics and batch fast-wrapper
 APIs.

Expose missing batch C bindings and debug visibility so downstream Rust tests can validate tuner behavior end-to-end.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 pr1_upstream_ready.patch        | 1158 +++++++++++++++++++++++++++++++
 src/c_bindings/fast_wrapper.cpp |  168 +++++
 src/c_bindings/fast_wrapper.h   |   32 +
 3 files changed, 1358 insertions(+)
 create mode 100644 pr1_upstream_ready.patch

diff --git a/pr1_upstream_ready.patch b/pr1_upstream_ready.patch
new file mode 100644
index 00000000..b14a93bb
--- /dev/null
+++ b/pr1_upstream_ready.patch
@@ -0,0 +1,1158 @@
+diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
+index ed41963..ca55a95 100644
+--- a/src/Makefile.vdf-client
++++ b/src/Makefile.vdf-client
+@@ -6,9 +6,24 @@ else
+ NOPIE = -no-pie
+ endif
+ 
+-LDFLAGS += -flto $(NOPIE) -g
++# Optional: override `LTO=` to disable link-time optimization.
++LTO ?= -flto
++
++# Optional: set `PIC=1` to build position-independent objects (recommended when
++# linking chiavdf code into other PIE/shared-library binaries).
++PIC ?= 0
++ifeq ($(PIC),1)
++PICFLAGS = -fPIC
++PIEFLAGS =
++else
++PICFLAGS =
++PIEFLAGS = $(NOPIE)
++endif
++
++LDFLAGS += $(LTO) $(PIEFLAGS) -g
+ LDLIBS += -lgmpxx -lgmp -pthread
+-CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
++CXXFLAGS += $(LTO) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
++ASFLAGS += $(PICFLAGS)
+ ifeq ($(UNAME),Darwin)
+ CXXFLAGS += -D CHIAOSX=1
+ endif
+@@ -31,7 +46,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
+ all: $(BINS)
+ 
+ clean:
+-	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
++	rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a
+ 
+ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
+ 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+@@ -39,7 +54,10 @@ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_as
+ $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
+ 
+ lzcnt.o: refcode/lzcnt.c
+-	$(CC) -c refcode/lzcnt.c
++	$(CC) -c refcode/lzcnt.c $(OPT_CFLAGS) $(PICFLAGS)
++
++%.o: %.s
++	$(CC) -c $< -o $@ $(ASFLAGS)
+ 
+ asm_compiled.s: compile_asm
+ 	./compile_asm
+@@ -53,6 +71,22 @@ avx512_asm_compiled.s: compile_asm
+ compile_asm: compile_asm.o
+ 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+ 
++# ---------------------------------------------------------------------------
++# Static library: fast one-wesolowski proof (BBR integration)
++# ---------------------------------------------------------------------------
++
++FASTLIB = libchiavdf_fastc.a
++FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
++
++.PHONY: fastlib
++
++fastlib: $(FASTLIB)
++
++$(FASTLIB): $(FASTLIB_OBJS)
++	$(AR) rcs $@ $^
++
++c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS)
++
+ HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base.o lzcnt.o
+ EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o
+ HW_LIB = hw/libft4222/build-x86_64/libft4222.so
+diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
+new file mode 100644
+index 0000000..198d0a8
+--- /dev/null
++++ b/src/c_bindings/fast_wrapper.cpp
+@@ -0,0 +1,795 @@
++#include "fast_wrapper.h"
++
++#include <atomic>
++#include <chrono>
++#include <limits>
++#include <mutex>
++#include <vector>
++
++#include "../vdf.h"
++#include "../create_discriminant.h"
++
++// Runtime configuration knobs required by `parameters.h`.
++// These are `extern` variables there, but each binary defines them explicitly.
++bool use_divide_table = false;
++int gcd_base_bits = 50;
++int gcd_128_max_iter = 3;
++std::string asmprefix = "cel_";
++bool enable_all_instructions = false;
++
++namespace {
++std::once_flag init_once;
++std::atomic<uint64_t> bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL);
++std::atomic<bool> streaming_stats_enabled(false);
++
++struct LastStreamingParameters {
++    uint32_t k = 0;
++    uint32_t l = 0;
++    bool tuned = false;
++    bool set = false;
++};
++
++thread_local LastStreamingParameters last_streaming_parameters;
++
++struct LastStreamingStats {
++    uint64_t checkpoint_total_ns = 0;
++    uint64_t checkpoint_event_total_ns = 0;
++    uint64_t finalize_total_ns = 0;
++    uint64_t checkpoint_calls = 0;
++    uint64_t bucket_updates = 0;
++    bool set = false;
++};
++
++thread_local LastStreamingStats last_streaming_stats;
++
++void init_chiavdf_fast() {
++    init_gmp();
++    set_rounding_mode();
++
++    // Match the vdf_client runtime selection for AVX2.
++    if (hasAVX2()) {
++        gcd_base_bits = 63;
++        gcd_128_max_iter = 2;
++    } else {
++        gcd_base_bits = 50;
++        gcd_128_max_iter = 3;
++    }
++
++    // Ensure we run the one-wesolowski path by default.
++    fast_algorithm = false;
++    two_weso = false;
++    quiet_mode = true;
++}
++
++ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
++
++uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
++    // Be conservative: class group forms contain 3 GMP-backed integers that
++    // quickly grow to the discriminant size (or beyond) during NUCOMP.
++    //
++    // This estimate is intentionally larger than the raw serialized size to
++    // avoid picking parameters that risk paging/OOM.
++    uint64_t discr_bytes = (static_cast<uint64_t>(discriminant_size_bits) + 7) / 8;
++    uint64_t estimate = discr_bytes * 16;
++    if (estimate < 2048) {
++        estimate = 2048;
++    }
++    return estimate;
++}
++
++// Picks the lowest-cost `(k,l)` whose `l * 2^k` buckets fit the budget; false (outputs untouched) if none.
++bool tune_streaming_parameters(
++    uint64_t num_iterations,
++    size_t discriminant_size_bits,
++    uint64_t memory_budget_bytes,
++    uint32_t& out_l,
++    uint32_t& out_k) {
++    if (memory_budget_bytes == 0) {
++        return false;
++    }
++    // Keep 20% headroom for GMP scratch/process overhead; divide first so huge budgets cannot overflow.
++    uint64_t budget = (memory_budget_bytes / 100) * 80 + ((memory_budget_bytes % 100) * 80) / 100;
++    uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits);
++    if (budget < bytes_per_form) {
++        return false;
++    }
++
++    unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
++    bool found = false;
++
++    // Empirical tuning notes (1024-bit discriminants, AVX2 build):
++    // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
++    // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs.
++    //
++    // So checkpoint counts should be weighted much lower than updates/fold.
++    constexpr unsigned __int128 update_weight = 16;
++    constexpr unsigned __int128 fold_weight = 16;
++    constexpr unsigned __int128 checkpoint_weight = 1;
++
++    // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work
++    // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k).
++    for (uint32_t k = 4; k <= 20; k++) {
++        unsigned __int128 buckets_per_row = static_cast<unsigned __int128>(1) << k;
++
++        for (uint32_t l = 1; l <= 64; l++) {
++            unsigned __int128 form_count = buckets_per_row * static_cast<unsigned __int128>(l);
++            unsigned __int128 mem_required =
++                form_count * static_cast<unsigned __int128>(bytes_per_form);
++            if (mem_required > static_cast<unsigned __int128>(budget)) {
++                continue;
++            }
++
++            unsigned __int128 updates = static_cast<unsigned __int128>(
++                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));
++            uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
++            unsigned __int128 checkpoints = static_cast<unsigned __int128>(
++                (num_iterations + kl - 1) / kl);
++            unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
++            unsigned __int128 cost =
++                updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;
++
++            if (!found || cost < best_cost) {
++                found = true;
++                best_cost = cost;
++                out_k = k;
++                out_l = l;
++            }
++        }
++    }
++
++    return found;
++}
++
++uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) {
++    integer res = FastPow(2, T - k * (i + 1), B);
++    mpz_mul_2exp(res.impl, res.impl, k);
++    res = res / B;
++    auto res_vector = res.to_vector();
++    return res_vector.empty() ? 0 : res_vector[0];
++}
++
++class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {
++  public:
++    ProgressOneWesolowskiCallback(
++        integer& D,
++        form& f,
++        uint64_t wanted_iter,
++        uint64_t progress_interval,
++        ChiavdfProgressCallback progress_cb,
++        void* progress_user_data)
++        : OneWesolowskiCallback(D, f, wanted_iter),
++          progress_interval(progress_interval),
++          progress_cb(progress_cb),
++          progress_user_data(progress_user_data),
++          next_progress(progress_interval) {}
++
++    void OnIteration(int type, void* data, uint64_t iteration) override {
++        OneWesolowskiCallback::OnIteration(type, data, iteration);
++
++        if (progress_cb == nullptr || progress_interval == 0) {
++            return;
++        }
++
++        uint64_t done = iteration + 1;
++        if (done > wanted_iter) {
++            return;
++        }
++
++        if (done >= next_progress) {
++            progress_cb(next_progress, progress_user_data);
++            next_progress += progress_interval;
++        }
++    }
++
++  private:
++    uint64_t progress_interval;
++    ChiavdfProgressCallback progress_cb;
++    void* progress_user_data;
++    uint64_t next_progress;
++};
++
++class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
++  public:
++    StreamingOneWesolowskiCallback(
++        integer& D,
++        uint64_t wanted_iter,
++        uint32_t k,
++        uint32_t l,
++        uint64_t limit,
++        integer& B,
++        bool use_getblock_opt,
++        uint64_t progress_interval,
++        ChiavdfProgressCallback progress_cb,
++        void* progress_user_data)
++        : WesolowskiCallback(D),
++          wanted_iter(wanted_iter),
++          k(k),
++          l(l),
++          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
++          limit(limit),
++          B(B),
++          progress_interval(progress_interval),
++          progress_cb(progress_cb),
++          progress_user_data(progress_user_data),
++          next_progress(progress_interval),
++          use_getblock_opt(use_getblock_opt),
++          stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
++        form id = form::identity(D);
++        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);
++
++        if (use_getblock_opt) {
++            getblock_ok = init_getblock_opt_state();
++        }
++    }
++
++    void OnIteration(int type, void* data, uint64_t iteration) override {
++        iteration++;
++        if (iteration > wanted_iter) {
++            return;
++        }
++
++        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
++            progress_cb(next_progress, progress_user_data);
++            next_progress += progress_interval;
++        }
++
++        if (iteration % kl == 0) {
++            uint64_t pos = iteration / kl;
++            if (pos < limit) {
++                form checkpoint;
++                auto started_at = std::chrono::steady_clock::time_point{};
++                if (stats_enabled) {
++                    started_at = std::chrono::steady_clock::now();
++                }
++                SetForm(type, data, &checkpoint);
++                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
++                if (stats_enabled) {
++                    checkpoint_event_total_ns += static_cast<uint64_t>(
++                        std::chrono::duration_cast<std::chrono::nanoseconds>(
++                            std::chrono::steady_clock::now() - started_at)
++                            .count());
++                }
++            }
++        }
++
++        if (iteration == wanted_iter) {
++            SetForm(type, data, &result);
++            has_result = true;
++        }
++    }
++    // Folds checkpoint number `i` into one bucket per row j; `record_stats` gates the timing counters.
++    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
++        const bool do_stats = stats_enabled && record_stats;
++        auto started_at = std::chrono::steady_clock::time_point{};
++        if (do_stats) {
++            started_at = std::chrono::steady_clock::now();
++        }
++        // Checkpoint i contributes to proof blocks p = i * l + j for j in [0, l).
++        uint64_t local_updates = 0;
++        for (uint32_t j = 0; j < l; j++) {
++            uint64_t p = i * static_cast<uint64_t>(l) + static_cast<uint64_t>(j);
++            uint64_t needed = static_cast<uint64_t>(k) * (p + 1);
++            if (wanted_iter < needed) {  // block p needs k * (p + 1) <= wanted_iter; later j only need more, so stop
++                break;
++            }
++            uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B);
++            if (do_stats) {
++                local_updates++;
++            }
++            nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L);  // presumably bucket(j, b) *= checkpoint (in-place composition)
++        }
++        // Counters are only touched when stats are enabled and requested for this call.
++        if (do_stats) {
++            checkpoint_calls++;
++            bucket_updates += local_updates;
++            checkpoint_total_ns += static_cast<uint64_t>(
++                std::chrono::duration_cast<std::chrono::nanoseconds>(
++                    std::chrono::steady_clock::now() - started_at)
++                    .count());
++        }
++    }
++
++    bool init_ok() const { return getblock_ok; }  // false only if the GetBlock-optimisation setup failed
++
++    bool ok() const { return has_result; }  // true once the form at `wanted_iter` has been captured
++
++    const form& y() const { return result; }  // captured output form; meaningful only when ok() is true
++    // Combines all buckets into the proof form: per row j, x = x^(2^k) * prod_b bucket(j, b)^b, rows taken from l-1 down to 0.
++    form finalize_proof() {
++        auto started_at = std::chrono::steady_clock::time_point{};
++        if (stats_enabled) {
++            started_at = std::chrono::steady_clock::now();
++        }
++
++        PulmarkReducer reducer;
++        form id = form::identity(D);
++
++        uint64_t k1 = k / 2;
++        uint64_t k0 = k - k1;
++        form x = id;
++        // A bucket index b is split as b = b1 * 2^k0 + b0 (b1: k1 high bits, b0: k0 low bits).
++        for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {
++            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
++            // High part: for each b1, multiply its 2^k0 buckets together, raise to b1 * 2^k0, fold into x.
++            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
++                form z = id;
++                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
++                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
++                }
++                z = FastPowFormNucomp(
++                    z,
++                    D,
++                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
++                    L,
++                    reducer);
++                nucomp_form(x, x, z, D, L);
++            }
++            // Low part: for each b0, multiply its 2^k1 buckets together, raise to b0, fold into x.
++            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
++                form z = id;
++                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
++                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
++                }
++                z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
++                nucomp_form(x, x, z, D, L);
++            }
++        }
++
++        reducer.reduce(x);
++
++        if (stats_enabled) {
++            finalize_total_ns += static_cast<uint64_t>(
++                std::chrono::duration_cast<std::chrono::nanoseconds>(
++                    std::chrono::steady_clock::now() - started_at)
++                    .count());
++        }
++        return x;
++    }
++
++    bool stats_ok() const { return stats_enabled; }  // the stats flag as sampled when this callback was constructed
++    // Snapshot of the counters gathered by this callback; `set` mirrors stats_enabled.
++    LastStreamingStats stats() const {
++        LastStreamingStats out;
++        out.checkpoint_total_ns = checkpoint_total_ns;
++        out.checkpoint_event_total_ns = checkpoint_event_total_ns;
++        out.finalize_total_ns = finalize_total_ns;
++        out.checkpoint_calls = checkpoint_calls;
++        out.bucket_updates = bucket_updates;
++        out.set = stats_enabled;
++        return out;
++    }
++
++  private:
++    form& bucket(uint32_t j, uint64_t b) {
++        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
++        return buckets[idx];
++    }
++    // Const overload. Both overloads map (row j, block value b) to the flat index j * 2^k + b, unchecked.
++    const form& bucket(uint32_t j, uint64_t b) const {
++        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
++        return buckets[idx];
++    }
++
++    uint64_t wanted_iter;  // target iteration count; iterations beyond it are ignored
++    uint32_t k;  // bits per proof block; each row has 2^k buckets
++    uint32_t l;  // number of bucket rows; each checkpoint updates up to l buckets
++    uint64_t kl;  // checkpoint spacing in iterations (presumably k * l, set in the constructor)
++    uint64_t limit;  // exclusive upper bound on checkpoint positions handled by OnIteration
++    integer B;  // value passed to get_block and used as the modulus of the rolling GetBlock state
++    uint64_t progress_interval;  // iterations between progress callbacks; 0 disables reporting
++    ChiavdfProgressCallback progress_cb;  // may be null, in which case no progress is reported
++    void* progress_user_data;  // passed back unchanged to progress_cb
++    uint64_t next_progress;  // next iteration threshold at which progress_cb fires
++    // Bucket storage and captured output.
++    std::vector<form> buckets;  // l * 2^k forms in one flat array, initialised to the identity
++    form result;  // form observed at wanted_iter
++    bool has_result = false;  // set once `result` has been captured
++    // State of the incremental GetBlock mapping.
++    bool use_getblock_opt;  // selects get_block_opt over get_block in process_checkpoint
++    bool getblock_ok = true;  // outcome of init_getblock_opt_state (stays true when the optimisation is unused)
++    uint64_t getblock_next_p = 0;  // block index the rolling state currently corresponds to
++    integer getblock_inv_2k;  // modular inverse of 2^k modulo B
++    integer getblock_r;  // rolling value; starts at 2^(wanted_iter - k) mod B
++    integer getblock_tmp;  // scratch integer reused by get_block_opt
++    // Timing counters; only updated when stats_enabled is true.
++    bool stats_enabled;  // sampled from the global flag at construction
++    uint64_t checkpoint_total_ns = 0;  // time spent inside process_checkpoint
++    uint64_t checkpoint_event_total_ns = 0;  // time per checkpoint event in OnIteration (SetForm + process_checkpoint)
++    uint64_t finalize_total_ns = 0;  // time spent in finalize_proof
++    uint64_t checkpoint_calls = 0;  // number of process_checkpoint calls that recorded stats
++    uint64_t bucket_updates = 0;  // number of bucket compositions performed by those calls
++    // Prepares the rolling state used by get_block_opt; returns false when that state cannot be built.
++    bool init_getblock_opt_state() {
++        if (k == 0) {
++            return false;
++        }
++        uint64_t k_u64 = static_cast<uint64_t>(k);
++        if (wanted_iter < k_u64) {  // no rolling state needed: get_block_opt falls back to get_block in this case
++            return true;
++        }
++        // inv_2k = (2^k)^-1 mod B. NOTE(review): FastPow(base, exp, mod) is assumed to be modular exponentiation.
++        integer two_k_mod = FastPow(2, k_u64, B);
++        if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) {  // 0 means no inverse exists
++            return false;
++        }
++        // The rolling value starts at 2^(wanted_iter - k) mod B, the value consumed for block p = 0.
++        getblock_r = FastPow(2, wanted_iter - k_u64, B);
++        getblock_next_p = 0;
++        return true;
++    }
++    // Incremental stand-in for get_block(p, k, wanted_iter, B), intended for increasing p; mutates the rolling state.
++    uint64_t get_block_opt(uint64_t p) {
++        if (!getblock_ok || wanted_iter < static_cast<uint64_t>(k)) {
++            return get_block(p, k, wanted_iter, B);
++        }
++
++        // Expected call pattern is sequential `p`. If we ever get out of sync,
++        // advance state forward or fall back to the slow mapping.
++        if (p < getblock_next_p) {  // the state has already moved past p and cannot rewind
++            return get_block(p, k, wanted_iter, B);
++        }
++        while (getblock_next_p < p) {  // skip ahead: one multiplication by inv_2k per skipped block
++            mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
++            mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
++            getblock_next_p++;
++        }
++        // b = floor(r * 2^k / B), computed from the state that corresponds to block p.
++        mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k);
++        mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl);
++        uint64_t b = mpz_get_ui(getblock_tmp.impl);
++        // Step the state to block p + 1 so the next sequential call needs no catch-up.
++        mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
++        mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
++        getblock_next_p++;
++
++        return b;
++    }
++};
++// Shared body of the streaming entry points. Returns `y || proof` or an empty result; exceptions propagate to the caller.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data,
++    bool use_getblock_opt) {
++    std::call_once(init_once, init_chiavdf_fast);
++
++    last_streaming_stats = LastStreamingStats{};
++
++    // Reject unusable arguments up front. The discriminant size must be non-zero and
++    // must survive the narrowing to `int` applied for CreateDiscriminant below.
++    const int discriminant_bits = static_cast<int>(discriminant_size_bits);
++    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
++        y_ref_s == nullptr || y_ref_s_size == 0 || num_iterations == 0 || discriminant_bits <= 0 ||
++        static_cast<size_t>(discriminant_bits) != discriminant_size_bits) {
++        return empty_result();
++    }
++    std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
++    integer D = CreateDiscriminant(challenge_hash_bytes, discriminant_bits);
++    integer L = root(-D, 4);
++
++    form x = DeserializeForm(D, x_s, x_s_size);
++    form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size);
++    // Choose (k, l): memory-budgeted tuner first, then the stock heuristic, then fixed values for small T.
++    uint32_t k;
++    uint32_t l;
++    bool tuned = false;
++    const uint64_t budget =
++        bucket_memory_budget_bytes.load(std::memory_order_relaxed);
++    if (num_iterations >= (1 << 16)) {
++        tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k);
++    }
++    if (!tuned) {
++        if (num_iterations >= (1 << 16)) {
++            ApproximateParameters(num_iterations, l, k);
++        } else {
++            k = 10;
++            l = 1;
++        }
++    }
++    if (k == 0) {  // keep k * l non-zero: it is used as a divisor below
++        k = 1;
++    }
++    if (l == 0) {
++        l = 1;
++    }
++
++    last_streaming_parameters.k = k;
++    last_streaming_parameters.l = l;
++    last_streaming_parameters.tuned = tuned;
++    last_streaming_parameters.set = true;
++    // limit = ceil(num_iterations / (k * l)): the number of checkpoint positions.
++    uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
++    uint64_t limit = num_iterations / kl;
++    if (num_iterations % kl) {
++        limit++;
++    }
++
++    integer B = GetB(D, x, y_ref);  // depends on y_ref, which is why the expected output is required up front
++
++    std::atomic<bool> stopped(false);
++    StreamingOneWesolowskiCallback weso(
++        D,
++        num_iterations,
++        k,
++        l,
++        limit,
++        B,
++        use_getblock_opt,
++        progress_interval,
++        progress_cb,
++        progress_user_data);
++
++    if (!weso.init_ok()) {
++        return empty_result();
++    }
++    // Checkpoint 0 is the input form itself; it is folded in before squaring starts and is not timed.
++    weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false);
++
++    FastStorage* fast_storage = nullptr;
++    repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
++    // Fail if the target iteration was never observed or the computed output is not the expected one.
++    if (!weso.ok()) {
++        return empty_result();
++    }
++    if (!(weso.y() == y_ref)) {
++        return empty_result();
++    }
++
++    form proof_form = weso.finalize_proof();
++
++    if (weso.stats_ok()) {
++        last_streaming_stats = weso.stats();
++    }
++
++    int d_bits = D.num_bits();
++    std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
++    std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
++
++    if (y_serialized.empty() || proof_serialized.empty()) {
++        return empty_result();
++    }
++    // Result layout is y || proof in a single new[] buffer, released by chiavdf_free_byte_array.
++    const size_t total = y_serialized.size() + proof_serialized.size();
++    uint8_t* out = new uint8_t[total];
++    std::copy(y_serialized.begin(), y_serialized.end(), out);
++    std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
++    return ChiavdfByteArray{out, total};
++}
++} // namespace
++// Convenience entry point: forwards to the `_with_progress` variant with progress reporting disabled.
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations) {
++    return chiavdf_prove_one_weso_fast_with_progress(
++        challenge_hash,
++        challenge_size,
++        x_s,
++        x_s_size,
++        discriminant_size_bits,
++        num_iterations,
++        /*progress_interval=*/0,
++        /*progress_cb=*/nullptr,
++        /*progress_user_data=*/nullptr);
++}
++// Non-streaming prover: runs the squaring, then builds the proof from stored intermediates. Any exception yields an empty result.
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data) {
++    try {
++        std::call_once(init_once, init_chiavdf_fast);
++
++        // A zero or non-int-representable discriminant size is rejected like any other bad argument.
++        const int discriminant_bits = static_cast<int>(discriminant_size_bits);
++        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
++            num_iterations == 0 || discriminant_bits <= 0 ||
++            static_cast<size_t>(discriminant_bits) != discriminant_size_bits) {
++            return empty_result();
++        }
++        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
++        integer D = CreateDiscriminant(challenge_hash_bytes, discriminant_bits);
++        integer L = root(-D, 4);
++
++        form x = DeserializeForm(D, x_s, x_s_size);
++
++        std::atomic<bool> stopped(false);
++        ProgressOneWesolowskiCallback weso(
++            D,
++            x,
++            num_iterations,
++            progress_interval,
++            progress_cb,
++            progress_user_data);
++
++        // Run the fast repeated-squaring engine to `num_iterations`.
++        // The callback stores all intermediates needed for the proof.
++        FastStorage* fast_storage = nullptr;
++        repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
++
++        // Now generate the compact proof from the stored intermediates.
++        Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped);
++        if (proof.y.empty() || proof.proof.empty()) {
++            return empty_result();
++        }
++
++        const size_t total = proof.y.size() + proof.proof.size();
++        uint8_t* out = new uint8_t[total];  // y || proof; released by chiavdf_free_byte_array
++        std::copy(proof.y.begin(), proof.y.end(), out);
++        std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size());
++        return ChiavdfByteArray{out, total};
++    } catch (...) {
++        return empty_result();
++    }
++}
++// Convenience entry point: forwards to the streaming `_with_progress` variant with progress reporting disabled.
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations) {
++    return chiavdf_prove_one_weso_fast_streaming_with_progress(
++        challenge_hash,
++        challenge_size,
++        x_s,
++        x_s_size,
++        y_ref_s,
++        y_ref_s_size,
++        discriminant_size_bits,
++        num_iterations,
++        /*progress_interval=*/0,
++        /*progress_cb=*/nullptr,
++        /*progress_user_data=*/nullptr);
++}
++// Streaming prover using the plain get_block mapping; any C++ exception is converted into an empty result.
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data) {
++    try {
++        return chiavdf_prove_one_weso_fast_streaming_impl(
++            challenge_hash,
++            challenge_size,
++            x_s,
++            x_s_size,
++            y_ref_s,
++            y_ref_s_size,
++            discriminant_size_bits,
++            num_iterations,
++            progress_interval,
++            progress_cb,
++            progress_user_data,
++            /*use_getblock_opt=*/false);
++    } catch (...) {  // exceptions must not cross the C ABI boundary
++        return empty_result();
++    }
++}
++// Convenience entry point: forwards to the getblock-opt `_with_progress` variant with progress reporting disabled.
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations) {
++    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
++        challenge_hash,
++        challenge_size,
++        x_s,
++        x_s_size,
++        y_ref_s,
++        y_ref_s_size,
++        discriminant_size_bits,
++        num_iterations,
++        /*progress_interval=*/0,
++        /*progress_cb=*/nullptr,
++        /*progress_user_data=*/nullptr);
++}
++// Streaming prover using the incremental GetBlock mapping; any C++ exception is converted into an empty result.
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data) {
++    try {
++        return chiavdf_prove_one_weso_fast_streaming_impl(
++            challenge_hash,
++            challenge_size,
++            x_s,
++            x_s_size,
++            y_ref_s,
++            y_ref_s_size,
++            discriminant_size_bits,
++            num_iterations,
++            progress_interval,
++            progress_cb,
++            progress_user_data,
++            /*use_getblock_opt=*/true);
++    } catch (...) {  // exceptions must not cross the C ABI boundary
++        return empty_result();
++    }
++}
++// Stores the budget that later streaming proofs pass to the (k, l) tuner; running proofs are not affected.
++extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) {
++    bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed);
++}
++// Toggles stats collection for subsequently constructed streaming callbacks and discards the last recorded stats.
++extern "C" void chiavdf_set_enable_streaming_stats(bool enable) {
++    streaming_stats_enabled.store(enable, std::memory_order_relaxed);
++    last_streaming_stats = LastStreamingStats{};
++}
++
++extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) {
++    // All three destinations are mandatory; nothing is written unless parameters were recorded.
++    const bool outputs_ok = out_k != nullptr && out_l != nullptr && out_tuned != nullptr;
++    if (!outputs_ok || !last_streaming_parameters.set) {
++        return false;
++    }
++
++    const auto& recorded = last_streaming_parameters;
++    *out_k = recorded.k;
++    *out_l = recorded.l;
++    *out_tuned = recorded.tuned;
++    return true;
++}
++
++extern "C" bool chiavdf_get_last_streaming_stats(
++    uint64_t* out_checkpoint_total_ns,
++    uint64_t* out_checkpoint_event_total_ns,
++    uint64_t* out_finalize_total_ns,
++    uint64_t* out_checkpoint_calls,
++    uint64_t* out_bucket_updates) {
++    // Every destination is required; a single NULL rejects the call before anything is written.
++    const bool outputs_ok = out_checkpoint_total_ns && out_checkpoint_event_total_ns &&
++                            out_finalize_total_ns && out_checkpoint_calls && out_bucket_updates;
++    if (!outputs_ok || !last_streaming_stats.set) {
++        return false;
++    }
++
++    // Copy the recorded counters out field by field.
++    const auto& recorded = last_streaming_stats;
++    *out_checkpoint_total_ns = recorded.checkpoint_total_ns;
++    *out_checkpoint_event_total_ns = recorded.checkpoint_event_total_ns;
++    *out_finalize_total_ns = recorded.finalize_total_ns;
++    *out_checkpoint_calls = recorded.checkpoint_calls;
++    *out_bucket_updates = recorded.bucket_updates;
++    return true;
++}
++// Releases a buffer produced with new[] by the proving functions; a NULL `data` pointer is a no-op.
++extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }
+diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
+new file mode 100644
+index 0000000..bf33f32
+--- /dev/null
++++ b/src/c_bindings/fast_wrapper.h
+@@ -0,0 +1,145 @@
++#pragma once
++
++#include <stdbool.h>
++#include <stddef.h>
++#include <stdint.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++typedef struct {
++    uint8_t* data;  // result bytes, or NULL on failure
++    size_t length;  // number of bytes in `data`; 0 on failure
++} ChiavdfByteArray;
++// Progress hook: receives an iteration count and the caller-supplied `user_data` pointer.
++typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
++
++// Configure the per-process memory budget used by the parameter tuner when
++// selecting `(k,l)` for streaming/bucket-based proving.
++//
++// The budget is per worker process (not global across multiple processes).
++//
++// If `bytes` is 0, the default chiavdf heuristic is used.
++void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes);
++
++// Debug helper: returns the `(k,l)` parameters selected for the most recent
++// streaming proof computed on the current thread.
++//
++// Returns true if parameters are available.
++bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned);
++
++// Enable lightweight timing counters for the streaming prover.
++//
++// When enabled, the native library records basic timing counters for the most
++// recent streaming proof computed on the current thread. This is intended for
++// benchmarking and tuning; production runs should keep this disabled to avoid
++// extra overhead.
++void chiavdf_set_enable_streaming_stats(bool enable);
++
++// Debug helper: returns timing counters for the most recent streaming proof on
++// the current thread.
++//
++// Returns true if stats are available (i.e. stats enabled and a streaming proof
++// was computed successfully).
++bool chiavdf_get_last_streaming_stats(
++    uint64_t* out_checkpoint_total_ns,
++    uint64_t* out_checkpoint_event_total_ns,
++    uint64_t* out_finalize_total_ns,
++    uint64_t* out_checkpoint_calls,
++    uint64_t* out_bucket_updates);
++
++// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
++//
++// On success, returns `y || proof` where:
++// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
++// - `proof` is the serialized witness form (same size as `y`)
++//
++// On failure, returns `{NULL, 0}`. Release a returned buffer with `chiavdf_free_byte_array`.
++ChiavdfByteArray chiavdf_prove_one_weso_fast(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations);
++
++// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
++// the proving thread every `progress_interval` iterations completed.
++//
++// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data);
++
++// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
++// bucket-accumulation algorithm (Trick 1), which requires the expected output
++// `y_ref` up front (as used by bluebox compaction jobs). On success, returns
++// `y || proof` (same format as `chiavdf_prove_one_weso_fast`). On failure,
++// including a computed output that differs from `y_ref`, returns `{NULL, 0}`.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations);
++
++// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
++// `progress_cb` from the proving thread every `progress_interval` iterations.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data);
++
++// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
++// implementation of the `GetBlock()` mapping (avoids per-block modular
++// exponentiation without allocating a full `GetBlock` table).
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations);
++
++// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
++// invokes `progress_cb` from the proving thread every `progress_interval`
++// iterations.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data);
++// Releases a buffer returned by the proving functions above; passing `{NULL, 0}` is allowed.
++void chiavdf_free_byte_array(ChiavdfByteArray array);
++
++#ifdef __cplusplus
++}
++#endif
+diff --git a/src/threading.h b/src/threading.h
+index 50d4b49..f6344ad 100644
+--- a/src/threading.h
++++ b/src/threading.h
+@@ -564,8 +564,8 @@ struct alignas(64) thread_counter {
+     }
+ };
+ 
+-thread_counter master_counter[100];
+-thread_counter slave_counter[100];
++thread_counter master_counter[512];
++thread_counter slave_counter[512];
+ 
+ struct thread_state {
+     int pairindex;
+diff --git a/src/vdf.h b/src/vdf.h
+index 9ab4aef..4544fe2 100644
+--- a/src/vdf.h
++++ b/src/vdf.h
+@@ -78,6 +78,18 @@ std::mutex new_event_mutex, cout_lock;
+ bool debug_mode = false;
+ bool fast_algorithm = false;
+ bool two_weso = false;
++bool quiet_mode = false;
++
++// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
++// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
++// Here each thread is given a slot on its first call, round-robin over the counter
++// array, so concurrent VDFs share a slot only once more threads than slots were created.
++inline int vdf_fast_pairindex() {
++    constexpr unsigned kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
++    static std::atomic<unsigned> next_slot{0};  // unsigned: counter wrap-around can never produce a negative index
++    thread_local int slot = int(next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots);
++    return slot;
++}
+ 
+ //always works
+ void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, const integer& L, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
+@@ -137,7 +149,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
+ 
+         // This works single threaded
+         square_state_type square_state;
+-        square_state.pairindex=0;
++        square_state.pairindex=vdf_fast_pairindex();
+ 
+         uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
+ 
+@@ -236,10 +248,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
+             }
+         #endif
+     }
+-    {
+-        // this shouldn't be needed but avoids some false positive in TSAN
+-        std::lock_guard<std::mutex> lk(cout_lock);
+-        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
++    if (!quiet_mode) {
++        {
++            // this shouldn't be needed but avoids some false positive in TSAN
++            std::lock_guard<std::mutex> lk(cout_lock);
++            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
++        }
+     }
+ 
+     #ifdef VDF_TEST
+@@ -275,11 +289,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
+     proof_serialized = SerializeForm(proof_form, d_bits);
+     Proof proof(y_serialized, proof_serialized);
+     proof.witness_type = 0;
+-    {
+-        // this shouldn't be needed but avoids some false positive in TSAN
+-        std::lock_guard<std::mutex> lk(cout_lock);
+-        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
+-    }
+     return proof;
+ }
+ 
+diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md
+new file mode 100644
+index 0000000..61cd1fd
+--- /dev/null
++++ b/docs/bluebox_compaction.md
+@@ -0,0 +1,49 @@
++# Bluebox Compaction Optimizations
++
++This document describes the compaction-oriented proving path exposed by
++`src/c_bindings/fast_wrapper.h` and implemented in
++`src/c_bindings/fast_wrapper.cpp`.
++
++## Scope
++
++These APIs are intended for workloads where the expected VDF output (`y_ref`) is
++already known up front (for example, bluebox compaction jobs). They are additive
++and do not change the existing `c_wrapper` APIs.
++
++## Optimization 1: Streaming one-wesolowski
++
++Given `y_ref`, the prover computes:
++
++- `B = GetB(D, x, y_ref)` before squaring starts
++
++This enables a streaming algorithm that updates proof buckets at each
++checkpoint during repeated squaring, instead of materializing the full
++intermediate checkpoint array and scanning it after the loop. In practice this
++substantially reduces memory usage for compaction workloads.
++
++## Optimization 2: Incremental GetBlock mapping
++
++For streaming checkpoint updates, bucket index selection repeatedly calls
++`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and
++advances sequential `p` values incrementally, avoiding full modular
++exponentiation per call and avoiding a large lookup table.
++
++## Optimization 3: Memory-budgeted (k, l) tuning
++
++The wrapper can tune `(k, l)` under a configured memory budget:
++
++- `chiavdf_set_bucket_memory_budget_bytes(...)`
++
++If no tuned candidate is found, the code falls back to the standard parameter
++heuristics.
++
++## Operational Notes
++
++- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to
++  avoid unsolicited stdout noise when embedded in multi-worker clients.
++- Each thread running a fast VDF gets its own counter slot from
++  `vdf_fast_pairindex()` (round-robin over the counter arrays), so concurrent VDF
++  computations in one process collide only once more threads than slots exist.
++- The production default for `enable_threads` in `parameters.h` is unchanged from
++  upstream to preserve timelord expectations.
++
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 61d24599..d660ee80 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -1,7 +1,9 @@
 #include "fast_wrapper.h"
 
 #include <atomic>
+#include <cassert>
 #include <chrono>
+#include <cstdio>
 #include <limits>
 #include <mutex>
 #include <vector>
@@ -63,6 +65,39 @@ void init_chiavdf_fast() {
 
 ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
 
+uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) {
+    // Unsigned addition wraps on overflow; a wrapped sum is smaller than either
+    // operand, in which case the result is pinned to the maximum value.
+    const uint64_t sum = lhs + rhs;
+    return sum < lhs ? std::numeric_limits<uint64_t>::max() : sum;
+}
+
+void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) {
+    // Releases each element's buffer, clears the element, then releases the
+    // array itself. A null `arrays` pointer is a no-op.
+    if (arrays != nullptr) {
+        for (ChiavdfByteArray* it = arrays; it != arrays + count; ++it) {
+            delete[] it->data;
+            *it = ChiavdfByteArray{nullptr, 0};
+        }
+        delete[] arrays;
+    }
+}
+
+struct BatchProgressContext {
+    uint64_t completed_before = 0;  // offset added to each reported iteration count by batch_progress_trampoline
+    ChiavdfProgressCallback progress_cb = nullptr;  // caller's callback; the trampoline does nothing when null
+    void* progress_user_data = nullptr;  // forwarded unchanged to progress_cb
+};
+
+void batch_progress_trampoline(uint64_t iters_done, void* user_data) {
+    // Forward only when the context and its callback exist, offset by prior progress.
+    const auto* batch = static_cast<const BatchProgressContext*>(user_data);
+    if (batch != nullptr && batch->progress_cb != nullptr) {
+        batch->progress_cb(saturating_add_u64(batch->completed_before, iters_done), batch->progress_user_data);
+    }
+}
+
 uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
     // Be conservative: class group forms contain 3 GMP-backed integers that
     // quickly grow to the discriminant size (or beyond) during NUCOMP.
@@ -96,6 +131,13 @@ bool tune_streaming_parameters(
 
     unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
     bool found = false;
+#ifndef NDEBUG
+    uint32_t best_k = 0;
+    uint32_t best_l = 0;
+    unsigned __int128 best_updates = 0;
+    unsigned __int128 best_checkpoints = 0;
+    unsigned __int128 best_fold = 0;
+#endif
 
     // Empirical tuning notes (1024-bit discriminants, AVX2 build):
     // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
@@ -133,10 +175,42 @@ bool tune_streaming_parameters(
                 best_cost = cost;
                 out_k = k;
                 out_l = l;
+#ifndef NDEBUG
+                best_k = k;
+                best_l = l;
+                best_updates = updates;
+                best_checkpoints = checkpoints;
+                best_fold = fold;
+#endif
             }
         }
     }
 
+#ifndef NDEBUG
+    if (found) {
+        assert(best_k >= 4 && best_k <= 20);
+        assert(best_l >= 1 && best_l <= 64);
+        std::fprintf(
+            stderr,
+            "[chiavdf] tune_streaming_parameters: T=%llu, budget=%llu, selected=(k=%u,l=%u), "
+            "components{updates=%llu, checkpoints=%llu, fold=%llu}, weights{u=16,c=1,f=16}\n",
+            static_cast<unsigned long long>(num_iterations),
+            static_cast<unsigned long long>(memory_budget_bytes),
+            best_k,
+            best_l,
+            static_cast<unsigned long long>(best_updates),
+            static_cast<unsigned long long>(best_checkpoints),
+            static_cast<unsigned long long>(best_fold));
+        if (best_k == 20 && num_iterations < (1ULL << 24)) {
+            std::fprintf(
+                stderr,
+                "[chiavdf] tune_streaming_parameters: high-k selection for moderate T "
+                "(k=20, T=%llu); verify measured update/fold timing assumptions.\n",
+                static_cast<unsigned long long>(num_iterations));
+        }
+    }
+#endif
+
     return found;
 }
 
@@ -331,6 +405,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         }
     }
 
+  public:
     bool init_ok() const { return getblock_ok; }
 
     bool ok() const { return has_result; }
@@ -401,6 +476,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         return out;
     }
 
+  private:
     form& bucket(uint32_t j, uint64_t b) {
         size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
         return buckets[idx];
@@ -836,4 +912,96 @@ extern "C" bool chiavdf_get_last_streaming_stats(
     return true;
 }
 
+extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
+        return nullptr;
+    }
+    if (discriminant_size_bits == 0 || jobs == nullptr || job_count == 0) {
+        return nullptr;
+    }
+    // All-or-nothing: an invalid job, a failed proof, or an exception frees the batch and returns nullptr.
+    ChiavdfByteArray* out_arrays = nullptr;
+    try {
+        out_arrays = new ChiavdfByteArray[job_count];
+        for (size_t idx = 0; idx < job_count; ++idx) {
+            out_arrays[idx] = empty_result();  // pre-fill every slot before any proof runs
+        }
+        // Jobs run sequentially in array order; completed_iters totals the iterations of finished jobs.
+        uint64_t completed_iters = 0;
+        for (size_t idx = 0; idx < job_count; ++idx) {
+            const ChiavdfBatchJob& job = jobs[idx];
+            if (job.y_ref_s == nullptr || job.y_ref_s_size == 0 || job.num_iterations == 0) {
+                free_byte_array_batch_internal(out_arrays, job_count);
+                return nullptr;
+            }
+            // Stack-local context for this job, handed to the trampoline through its user-data pointer.
+            BatchProgressContext progress_ctx;
+            progress_ctx.completed_before = completed_iters;
+            progress_ctx.progress_cb = progress_cb;
+            progress_ctx.progress_user_data = progress_user_data;
+            const bool use_progress = progress_cb != nullptr && progress_interval != 0;
+            // Delegate to the single-proof API; the trampoline is installed only when progress is enabled.
+            out_arrays[idx] = chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
+                challenge_hash,
+                challenge_size,
+                x_s,
+                x_s_size,
+                job.y_ref_s,
+                job.y_ref_s_size,
+                discriminant_size_bits,
+                job.num_iterations,
+                progress_interval,
+                use_progress ? batch_progress_trampoline : nullptr,
+                use_progress ? static_cast<void*>(&progress_ctx) : nullptr);
+            // A null or zero-length result marks this proof as failed, which aborts the whole batch.
+            if (out_arrays[idx].data == nullptr || out_arrays[idx].length == 0) {
+                free_byte_array_batch_internal(out_arrays, job_count);
+                return nullptr;
+            }
+            // Advance the offset that seeds the next job's progress context.
+            completed_iters = saturating_add_u64(completed_iters, job.num_iterations);
+        }
+        // Success: every slot passed the non-empty check above.
+        return out_arrays;
+    } catch (...) {  // catch everything so no exception escapes this extern "C" function
+        free_byte_array_batch_internal(out_arrays, job_count);
+        return nullptr;
+    }
+}
+
+extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count) {  // convenience wrapper: same batch proving with progress reporting disabled
+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        discriminant_size_bits,
+        jobs,
+        job_count,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count) {
+    free_byte_array_batch_internal(arrays, count);  // the helper returns early when `arrays` is nullptr
+}
+
 extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }
diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
index bf33f320..115c3abd 100644
--- a/src/c_bindings/fast_wrapper.h
+++ b/src/c_bindings/fast_wrapper.h
@@ -13,6 +13,12 @@ typedef struct {
     size_t length;
 } ChiavdfByteArray;
 
+typedef struct {
+    const uint8_t* y_ref_s;   // expected VDF output (`y_ref`) bytes for this job; the batch call fails if NULL
+    size_t y_ref_s_size;      // size of `y_ref_s` (presumably in bytes); the batch call fails if 0
+    uint64_t num_iterations;  // iteration count to prove for this job; the batch call fails if 0
+} ChiavdfBatchJob;  // one element of the `jobs` array taken by the batch proving APIs
+
 typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
 
 // Configure the per-process memory budget used by the parameter tuner when
@@ -138,6 +144,32 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progres
     ChiavdfProgressCallback progress_cb,
     void* progress_user_data);
 
+// Batch variant: computes one proof per `jobs[i]`; all jobs share challenge, x and discriminant size.
+// Returns `job_count` results, or NULL on bad input or any failed job; free with chiavdf_free_byte_array_batch().
+ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count);
+
+// Same as the batch API above, with an optional aggregate progress callback (off if NULL or interval is 0).
+ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count);  // for arrays returned by the batch APIs; NULL is accepted
+
 void chiavdf_free_byte_array(ChiavdfByteArray array);
 
 #ifdef __cplusplus

From 95f8ff18d3adcfd291767fbc35273a202d6c645d Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:51:52 -0800
Subject: [PATCH 07/21] Make fast-thread counter slots build-configurable.

Default CHIA_VDF_FAST_COUNTER_SLOTS to 100 in threading.h so upstream builds keep lower BSS usage while allowing embedded deployments to override via compiler defines.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/threading.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/threading.h b/src/threading.h
index 8354d824..dbb18592 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -566,8 +566,12 @@ struct alignas(64) thread_counter {
     }
 };
 
-thread_counter master_counter[512];
-thread_counter slave_counter[512];
+#ifndef CHIA_VDF_FAST_COUNTER_SLOTS
+#define CHIA_VDF_FAST_COUNTER_SLOTS 100
+#endif
+
+thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
+thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 
 struct thread_state {
     int pairindex;

From 746ba2e8edadadb0e81951c4bf93aaf8a5a3dbe0 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:55:50 -0800
Subject: [PATCH 08/21] Fix fast pairindex allocator state across translation
 units.

Use one program-wide atomic slot allocator for `vdf_fast_pairindex()` so concurrent VDF computations started from different translation units cannot collide on shared fast counter slots.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/vdf.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/vdf.h b/src/vdf.h
index 8ca75d8c..575cc78a 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -93,11 +93,16 @@ bool quiet_mode = false;
 // The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
 // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
 // run concurrently in the same process; they must not share a pairindex.
+#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
+// Keep slot allocation state as one program-wide object shared by all TUs that include
+// this header, so every thread draws from a single sequence (slots still wrap modulo kSlots).
+inline std::atomic<unsigned int> vdf_fast_next_slot{0};
+#endif
+
 inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
-    static std::atomic<unsigned int> next_slot{0};
-    thread_local int slot = int(next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
+    thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
     return slot;
 #else
     return 0;

From 707b2f46e3499b2ba65547693401ceb7a4f069a3 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:58:53 -0800
Subject: [PATCH 09/21] Guard streaming prover bucket shifts against invalid k.

Reject k>=64 before any 64-bit left-shift and reuse validated bucket spans for allocation, indexing, and finalization loops so invalid parameter tuning cannot trigger undefined behavior.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 53 ++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index d660ee80..af3bf805 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -72,6 +72,14 @@ uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) {
     return lhs + rhs;
 }
 
+bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) {  // stores 2^shift in `out` when shift < 64
+    if (shift >= 64) {  // shifting by the operand width or more is undefined behaviour in C++
+        return false;   // `out` is left unmodified on failure
+    }
+    out = 1ULL << shift;
+    return true;
+}
+
 void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) {
     if (arrays == nullptr) {
         return;
@@ -289,7 +297,19 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
           use_getblock_opt(use_getblock_opt),
           stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
         form id = form::identity(D);
-        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);
+        uint64_t bucket_span_u64 = 0;
+        if (!try_pow2_u64_shift(k, bucket_span_u64)) {
+            getblock_ok = false;
+            return;
+        }
+
+        bucket_span = static_cast<size_t>(bucket_span_u64);
+        if (bucket_span != 0 && static_cast<size_t>(l) > std::numeric_limits<size_t>::max() / bucket_span) {
+            getblock_ok = false;
+            return;
+        }
+
+        buckets.resize(static_cast<size_t>(l) * bucket_span, id);
 
         if (use_getblock_opt) {
             getblock_ok = init_getblock_opt_state();
@@ -423,29 +443,35 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
         uint64_t k1 = k / 2;
         uint64_t k0 = k - k1;
+        uint64_t span_k0 = 0;
+        uint64_t span_k1 = 0;
+        if (!try_pow2_u64_shift(static_cast<uint32_t>(k0), span_k0) ||
+            !try_pow2_u64_shift(static_cast<uint32_t>(k1), span_k1)) {
+            return form::identity(D);
+        }
         form x = id;
 
         for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {
-            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
+            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(bucket_span)), L, reducer);
 
-            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
+            for (uint64_t b1 = 0; b1 < span_k1; b1++) {
                 form z = id;
-                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
-                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                for (uint64_t b0 = 0; b0 < span_k0; b0++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * span_k0 + b0), D, L);
                 }
                 z = FastPowFormNucomp(
                     z,
                     D,
-                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
+                    integer(static_cast<uint64_t>(b1 * span_k0)),
                     L,
                     reducer);
                 nucomp_form(x, x, z, D, L);
             }
 
-            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
+            for (uint64_t b0 = 0; b0 < span_k0; b0++) {
                 form z = id;
-                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
-                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                for (uint64_t b1 = 0; b1 < span_k1; b1++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * span_k0 + b0), D, L);
                 }
                 z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
                 nucomp_form(x, x, z, D, L);
@@ -478,12 +504,12 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
   private:
     form& bucket(uint32_t j, uint64_t b) {
-        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        size_t idx = static_cast<size_t>(j) * bucket_span + static_cast<size_t>(b);
         return buckets[idx];
     }
 
     const form& bucket(uint32_t j, uint64_t b) const {
-        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        size_t idx = static_cast<size_t>(j) * bucket_span + static_cast<size_t>(b);
         return buckets[idx];
     }
 
@@ -497,6 +523,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     ChiavdfProgressCallback progress_cb;
     void* progress_user_data;
     uint64_t next_progress;
+    size_t bucket_span = 0;
 
     std::vector<form> buckets;
     form result;
@@ -620,6 +647,10 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
     if (l == 0) {
         l = 1;
     }
+    uint64_t ignored_bucket_span = 0;
+    if (!try_pow2_u64_shift(k, ignored_bucket_span)) {
+        return empty_result();
+    }
 
     last_streaming_parameters.k = k;
     last_streaming_parameters.l = l;

From 0c11002ba3aabb9cdb2f65bf31d9c8cd5393fb73 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 19:17:42 -0700
Subject: [PATCH 10/21] Harden fast counter slot safety and macOS cmake setup.

Add compile-time guards that reject zero fast-counter slot configurations before modulo indexing, and export Homebrew's cmake path in macOS workflows so cmake is available within the same step on Intel runners.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-c-libraries.yml | 1 +
 .github/workflows/build.yml             | 1 +
 src/threading.h                         | 2 ++
 src/vdf.h                               | 1 +
 4 files changed, 5 insertions(+)

diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml
index db833104..451fabee 100644
--- a/.github/workflows/build-c-libraries.yml
+++ b/.github/workflows/build-c-libraries.yml
@@ -90,6 +90,7 @@ jobs:
         CMAKE_BIN="$(brew --prefix cmake)/bin"
         if [ -d "$CMAKE_BIN" ]; then
           echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+          export PATH="$CMAKE_BIN:$PATH"
         fi
         cmake --version
 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4ad967ec..798241d7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -110,6 +110,7 @@ jobs:
         CMAKE_BIN="$(brew --prefix cmake)/bin"
         if [ -d "$CMAKE_BIN" ]; then
           echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+          export PATH="$CMAKE_BIN:$PATH"
         fi
         cmake --version
 
diff --git a/src/threading.h b/src/threading.h
index dbb18592..3574a98f 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -570,6 +570,8 @@ struct alignas(64) thread_counter {
 #define CHIA_VDF_FAST_COUNTER_SLOTS 100
 #endif
 
+static_assert(CHIA_VDF_FAST_COUNTER_SLOTS > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
+
 thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 
diff --git a/src/vdf.h b/src/vdf.h
index 575cc78a..eb1d0d39 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -102,6 +102,7 @@ inline std::atomic<unsigned int> vdf_fast_next_slot{0};
 inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
+    static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
     thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
     return slot;
 #else

From 1e7342199beb8e0d199f2201e6040ef5b4f6ca94 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 19:37:23 -0700
Subject: [PATCH 11/21] Remove stale patch artifact and refine tuner update
 cost.

Drop the root-level development patch file that diverged from the live implementation, and adjust the streaming tuner cost model so bucket-update work scales with checkpoint count and `l` instead of only `k`.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 pr1_upstream_ready.patch        | 1158 -------------------------------
 src/c_bindings/fast_wrapper.cpp |    5 +-
 2 files changed, 3 insertions(+), 1160 deletions(-)
 delete mode 100644 pr1_upstream_ready.patch

diff --git a/pr1_upstream_ready.patch b/pr1_upstream_ready.patch
deleted file mode 100644
index b14a93bb..00000000
--- a/pr1_upstream_ready.patch
+++ /dev/null
@@ -1,1158 +0,0 @@
-diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
-index ed41963..ca55a95 100644
---- a/src/Makefile.vdf-client
-+++ b/src/Makefile.vdf-client
-@@ -6,9 +6,24 @@ else
- NOPIE = -no-pie
- endif
- 
--LDFLAGS += -flto $(NOPIE) -g
-+# Optional: override `LTO=` to disable link-time optimization.
-+LTO ?= -flto
-+
-+# Optional: set `PIC=1` to build position-independent objects (recommended when
-+# linking chiavdf code into other PIE/shared-library binaries).
-+PIC ?= 0
-+ifeq ($(PIC),1)
-+PICFLAGS = -fPIC
-+PIEFLAGS =
-+else
-+PICFLAGS =
-+PIEFLAGS = $(NOPIE)
-+endif
-+
-+LDFLAGS += $(LTO) $(PIEFLAGS) -g
- LDLIBS += -lgmpxx -lgmp -pthread
--CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
-+CXXFLAGS += $(LTO) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
-+ASFLAGS += $(PICFLAGS)
- ifeq ($(UNAME),Darwin)
- CXXFLAGS += -D CHIAOSX=1
- endif
-@@ -31,7 +46,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
- all: $(BINS)
- 
- clean:
--	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
-+	rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a
- 
- $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
- 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
-@@ -39,7 +54,10 @@ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_as
- $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
- 
- lzcnt.o: refcode/lzcnt.c
--	$(CC) -c refcode/lzcnt.c
-+	$(CC) -c refcode/lzcnt.c $(OPT_CFLAGS) $(PICFLAGS)
-+
-+%.o: %.s
-+	$(CC) -c $< -o $@ $(ASFLAGS)
- 
- asm_compiled.s: compile_asm
- 	./compile_asm
-@@ -53,6 +71,22 @@ avx512_asm_compiled.s: compile_asm
- compile_asm: compile_asm.o
- 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
- 
-+# ---------------------------------------------------------------------------
-+# Static library: fast one-wesolowski proof (BBR integration)
-+# ---------------------------------------------------------------------------
-+
-+FASTLIB = libchiavdf_fastc.a
-+FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
-+
-+.PHONY: fastlib
-+
-+fastlib: $(FASTLIB)
-+
-+$(FASTLIB): $(FASTLIB_OBJS)
-+	$(AR) rcs $@ $^
-+
-+c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS)
-+
- HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base.o lzcnt.o
- EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o
- HW_LIB = hw/libft4222/build-x86_64/libft4222.so
-diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
-new file mode 100644
-index 0000000..198d0a8
---- /dev/null
-+++ b/src/c_bindings/fast_wrapper.cpp
-@@ -0,0 +1,795 @@
-+#include "fast_wrapper.h"
-+
-+#include <atomic>
-+#include <chrono>
-+#include <limits>
-+#include <mutex>
-+#include <vector>
-+
-+#include "../vdf.h"
-+#include "../create_discriminant.h"
-+
-+// Runtime configuration knobs required by `parameters.h`.
-+// These are `extern` variables there, but each binary defines them explicitly.
-+bool use_divide_table = false;
-+int gcd_base_bits = 50;
-+int gcd_128_max_iter = 3;
-+std::string asmprefix = "cel_";
-+bool enable_all_instructions = false;
-+
-+namespace {
-+std::once_flag init_once;
-+std::atomic<uint64_t> bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL);
-+std::atomic<bool> streaming_stats_enabled(false);
-+
-+struct LastStreamingParameters {
-+    uint32_t k = 0;
-+    uint32_t l = 0;
-+    bool tuned = false;
-+    bool set = false;
-+};
-+
-+thread_local LastStreamingParameters last_streaming_parameters;
-+
-+struct LastStreamingStats {
-+    uint64_t checkpoint_total_ns = 0;
-+    uint64_t checkpoint_event_total_ns = 0;
-+    uint64_t finalize_total_ns = 0;
-+    uint64_t checkpoint_calls = 0;
-+    uint64_t bucket_updates = 0;
-+    bool set = false;
-+};
-+
-+thread_local LastStreamingStats last_streaming_stats;
-+
-+void init_chiavdf_fast() {
-+    init_gmp();
-+    set_rounding_mode();
-+
-+    // Match the vdf_client runtime selection for AVX2.
-+    if (hasAVX2()) {
-+        gcd_base_bits = 63;
-+        gcd_128_max_iter = 2;
-+    } else {
-+        gcd_base_bits = 50;
-+        gcd_128_max_iter = 3;
-+    }
-+
-+    // Ensure we run the one-wesolowski path by default.
-+    fast_algorithm = false;
-+    two_weso = false;
-+    quiet_mode = true;
-+}
-+
-+ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
-+
-+uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
-+    // Be conservative: class group forms contain 3 GMP-backed integers that
-+    // quickly grow to the discriminant size (or beyond) during NUCOMP.
-+    //
-+    // This estimate is intentionally larger than the raw serialized size to
-+    // avoid picking parameters that risk paging/OOM.
-+    uint64_t discr_bytes = (static_cast<uint64_t>(discriminant_size_bits) + 7) / 8;
-+    uint64_t estimate = discr_bytes * 16;
-+    if (estimate < 2048) {
-+        estimate = 2048;
-+    }
-+    return estimate;
-+}
-+
-+bool tune_streaming_parameters(
-+    uint64_t num_iterations,
-+    size_t discriminant_size_bits,
-+    uint64_t memory_budget_bytes,
-+    uint32_t& out_l,
-+    uint32_t& out_k) {
-+    if (memory_budget_bytes == 0) {
-+        return false;
-+    }
-+
-+    // Keep headroom for GMP scratch allocations and general process overhead.
-+    uint64_t budget = (memory_budget_bytes * 80) / 100;
-+    uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits);
-+    if (budget < bytes_per_form) {
-+        return false;
-+    }
-+
-+    unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
-+    bool found = false;
-+
-+    // Empirical tuning notes (1024-bit discriminants, AVX2 build):
-+    // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
-+    // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs.
-+    //
-+    // So checkpoint counts should be weighted much lower than updates/fold.
-+    constexpr unsigned __int128 update_weight = 16;
-+    constexpr unsigned __int128 fold_weight = 16;
-+    constexpr unsigned __int128 checkpoint_weight = 1;
-+
-+    // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work
-+    // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k).
-+    for (uint32_t k = 4; k <= 20; k++) {
-+        unsigned __int128 buckets_per_row = static_cast<unsigned __int128>(1) << k;
-+
-+        for (uint32_t l = 1; l <= 64; l++) {
-+            unsigned __int128 form_count = buckets_per_row * static_cast<unsigned __int128>(l);
-+            unsigned __int128 mem_required =
-+                form_count * static_cast<unsigned __int128>(bytes_per_form);
-+            if (mem_required > static_cast<unsigned __int128>(budget)) {
-+                continue;
-+            }
-+
-+            unsigned __int128 updates = static_cast<unsigned __int128>(
-+                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));
-+            uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
-+            unsigned __int128 checkpoints = static_cast<unsigned __int128>(
-+                (num_iterations + kl - 1) / kl);
-+            unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
-+            unsigned __int128 cost =
-+                updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;
-+
-+            if (!found || cost < best_cost) {
-+                found = true;
-+                best_cost = cost;
-+                out_k = k;
-+                out_l = l;
-+            }
-+        }
-+    }
-+
-+    return found;
-+}
-+
-+uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) {
-+    integer res = FastPow(2, T - k * (i + 1), B);
-+    mpz_mul_2exp(res.impl, res.impl, k);
-+    res = res / B;
-+    auto res_vector = res.to_vector();
-+    return res_vector.empty() ? 0 : res_vector[0];
-+}
-+
-+class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {
-+  public:
-+    ProgressOneWesolowskiCallback(
-+        integer& D,
-+        form& f,
-+        uint64_t wanted_iter,
-+        uint64_t progress_interval,
-+        ChiavdfProgressCallback progress_cb,
-+        void* progress_user_data)
-+        : OneWesolowskiCallback(D, f, wanted_iter),
-+          progress_interval(progress_interval),
-+          progress_cb(progress_cb),
-+          progress_user_data(progress_user_data),
-+          next_progress(progress_interval) {}
-+
-+    void OnIteration(int type, void* data, uint64_t iteration) override {
-+        OneWesolowskiCallback::OnIteration(type, data, iteration);
-+
-+        if (progress_cb == nullptr || progress_interval == 0) {
-+            return;
-+        }
-+
-+        uint64_t done = iteration + 1;
-+        if (done > wanted_iter) {
-+            return;
-+        }
-+
-+        if (done >= next_progress) {
-+            progress_cb(next_progress, progress_user_data);
-+            next_progress += progress_interval;
-+        }
-+    }
-+
-+  private:
-+    uint64_t progress_interval;
-+    ChiavdfProgressCallback progress_cb;
-+    void* progress_user_data;
-+    uint64_t next_progress;
-+};
-+
-+class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
-+  public:
-+    StreamingOneWesolowskiCallback(
-+        integer& D,
-+        uint64_t wanted_iter,
-+        uint32_t k,
-+        uint32_t l,
-+        uint64_t limit,
-+        integer& B,
-+        bool use_getblock_opt,
-+        uint64_t progress_interval,
-+        ChiavdfProgressCallback progress_cb,
-+        void* progress_user_data)
-+        : WesolowskiCallback(D),
-+          wanted_iter(wanted_iter),
-+          k(k),
-+          l(l),
-+          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
-+          limit(limit),
-+          B(B),
-+          progress_interval(progress_interval),
-+          progress_cb(progress_cb),
-+          progress_user_data(progress_user_data),
-+          next_progress(progress_interval),
-+          use_getblock_opt(use_getblock_opt),
-+          stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
-+        form id = form::identity(D);
-+        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);
-+
-+        if (use_getblock_opt) {
-+            getblock_ok = init_getblock_opt_state();
-+        }
-+    }
-+
-+    void OnIteration(int type, void* data, uint64_t iteration) override {
-+        iteration++;
-+        if (iteration > wanted_iter) {
-+            return;
-+        }
-+
-+        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
-+            progress_cb(next_progress, progress_user_data);
-+            next_progress += progress_interval;
-+        }
-+
-+        if (iteration % kl == 0) {
-+            uint64_t pos = iteration / kl;
-+            if (pos < limit) {
-+                form checkpoint;
-+                auto started_at = std::chrono::steady_clock::time_point{};
-+                if (stats_enabled) {
-+                    started_at = std::chrono::steady_clock::now();
-+                }
-+                SetForm(type, data, &checkpoint);
-+                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
-+                if (stats_enabled) {
-+                    checkpoint_event_total_ns += static_cast<uint64_t>(
-+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
-+                            std::chrono::steady_clock::now() - started_at)
-+                            .count());
-+                }
-+            }
-+        }
-+
-+        if (iteration == wanted_iter) {
-+            SetForm(type, data, &result);
-+            has_result = true;
-+        }
-+    }
-+
-+    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
-+        const bool do_stats = stats_enabled && record_stats;
-+        auto started_at = std::chrono::steady_clock::time_point{};
-+        if (do_stats) {
-+            started_at = std::chrono::steady_clock::now();
-+        }
-+
-+        uint64_t local_updates = 0;
-+        for (uint32_t j = 0; j < l; j++) {
-+            uint64_t p = i * static_cast<uint64_t>(l) + static_cast<uint64_t>(j);
-+            uint64_t needed = static_cast<uint64_t>(k) * (p + 1);
-+            if (wanted_iter < needed) {
-+                break;
-+            }
-+            uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B);
-+            if (do_stats) {
-+                local_updates++;
-+            }
-+            nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L);
-+        }
-+
-+        if (do_stats) {
-+            checkpoint_calls++;
-+            bucket_updates += local_updates;
-+            checkpoint_total_ns += static_cast<uint64_t>(
-+                std::chrono::duration_cast<std::chrono::nanoseconds>(
-+                    std::chrono::steady_clock::now() - started_at)
-+                    .count());
-+        }
-+    }
-+
-+    bool init_ok() const { return getblock_ok; }
-+
-+    bool ok() const { return has_result; }
-+
-+    const form& y() const { return result; }
-+
-+    form finalize_proof() {
-+        auto started_at = std::chrono::steady_clock::time_point{};
-+        if (stats_enabled) {
-+            started_at = std::chrono::steady_clock::now();
-+        }
-+
-+        PulmarkReducer reducer;
-+        form id = form::identity(D);
-+
-+        uint64_t k1 = k / 2;
-+        uint64_t k0 = k - k1;
-+        form x = id;
-+
-+        for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {
-+            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
-+
-+            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
-+                form z = id;
-+                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
-+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
-+                }
-+                z = FastPowFormNucomp(
-+                    z,
-+                    D,
-+                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
-+                    L,
-+                    reducer);
-+                nucomp_form(x, x, z, D, L);
-+            }
-+
-+            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
-+                form z = id;
-+                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
-+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
-+                }
-+                z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
-+                nucomp_form(x, x, z, D, L);
-+            }
-+        }
-+
-+        reducer.reduce(x);
-+
-+        if (stats_enabled) {
-+            finalize_total_ns += static_cast<uint64_t>(
-+                std::chrono::duration_cast<std::chrono::nanoseconds>(
-+                    std::chrono::steady_clock::now() - started_at)
-+                    .count());
-+        }
-+        return x;
-+    }
-+
-+    bool stats_ok() const { return stats_enabled; }
-+
-+    LastStreamingStats stats() const {
-+        LastStreamingStats out;
-+        out.checkpoint_total_ns = checkpoint_total_ns;
-+        out.checkpoint_event_total_ns = checkpoint_event_total_ns;
-+        out.finalize_total_ns = finalize_total_ns;
-+        out.checkpoint_calls = checkpoint_calls;
-+        out.bucket_updates = bucket_updates;
-+        out.set = stats_enabled;
-+        return out;
-+    }
-+
-+  private:
-+    form& bucket(uint32_t j, uint64_t b) {
-+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
-+        return buckets[idx];
-+    }
-+
-+    const form& bucket(uint32_t j, uint64_t b) const {
-+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
-+        return buckets[idx];
-+    }
-+
-+    uint64_t wanted_iter;
-+    uint32_t k;
-+    uint32_t l;
-+    uint64_t kl;
-+    uint64_t limit;
-+    integer B;
-+    uint64_t progress_interval;
-+    ChiavdfProgressCallback progress_cb;
-+    void* progress_user_data;
-+    uint64_t next_progress;
-+
-+    std::vector<form> buckets;
-+    form result;
-+    bool has_result = false;
-+
-+    bool use_getblock_opt;
-+    bool getblock_ok = true;
-+    uint64_t getblock_next_p = 0;
-+    integer getblock_inv_2k;
-+    integer getblock_r;
-+    integer getblock_tmp;
-+
-+    bool stats_enabled;
-+    uint64_t checkpoint_total_ns = 0;
-+    uint64_t checkpoint_event_total_ns = 0;
-+    uint64_t finalize_total_ns = 0;
-+    uint64_t checkpoint_calls = 0;
-+    uint64_t bucket_updates = 0;
-+
-+    bool init_getblock_opt_state() {
-+        if (k == 0) {
-+            return false;
-+        }
-+        uint64_t k_u64 = static_cast<uint64_t>(k);
-+        if (wanted_iter < k_u64) {
-+            return true;
-+        }
-+
-+        integer two_k_mod = FastPow(2, k_u64, B);
-+        if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) {
-+            return false;
-+        }
-+
-+        getblock_r = FastPow(2, wanted_iter - k_u64, B);
-+        getblock_next_p = 0;
-+        return true;
-+    }
-+
-+    uint64_t get_block_opt(uint64_t p) {
-+        if (!getblock_ok || wanted_iter < static_cast<uint64_t>(k)) {
-+            return get_block(p, k, wanted_iter, B);
-+        }
-+
-+        // Expected call pattern is sequential `p`. If we ever get out of sync,
-+        // advance state forward or fall back to the slow mapping.
-+        if (p < getblock_next_p) {
-+            return get_block(p, k, wanted_iter, B);
-+        }
-+        while (getblock_next_p < p) {
-+            mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
-+            mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
-+            getblock_next_p++;
-+        }
-+
-+        mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k);
-+        mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl);
-+        uint64_t b = mpz_get_ui(getblock_tmp.impl);
-+
-+        mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
-+        mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
-+        getblock_next_p++;
-+
-+        return b;
-+    }
-+};
-+
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data,
-+    bool use_getblock_opt) {
-+    std::call_once(init_once, init_chiavdf_fast);
-+
-+    last_streaming_stats = LastStreamingStats{};
-+
-+    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
-+        y_ref_s == nullptr || y_ref_s_size == 0) {
-+        return empty_result();
-+    }
-+    if (num_iterations == 0) {
-+        return empty_result();
-+    }
-+
-+    std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
-+    integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
-+    integer L = root(-D, 4);
-+
-+    form x = DeserializeForm(D, x_s, x_s_size);
-+    form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size);
-+
-+    uint32_t k;
-+    uint32_t l;
-+    bool tuned = false;
-+    const uint64_t budget =
-+        bucket_memory_budget_bytes.load(std::memory_order_relaxed);
-+    if (num_iterations >= (1 << 16)) {
-+        tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k);
-+    }
-+    if (!tuned) {
-+        if (num_iterations >= (1 << 16)) {
-+            ApproximateParameters(num_iterations, l, k);
-+        } else {
-+            k = 10;
-+            l = 1;
-+        }
-+    }
-+    if (k == 0) {
-+        k = 1;
-+    }
-+    if (l == 0) {
-+        l = 1;
-+    }
-+
-+    last_streaming_parameters.k = k;
-+    last_streaming_parameters.l = l;
-+    last_streaming_parameters.tuned = tuned;
-+    last_streaming_parameters.set = true;
-+
-+    uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
-+    uint64_t limit = num_iterations / kl;
-+    if (num_iterations % kl) {
-+        limit++;
-+    }
-+
-+    integer B = GetB(D, x, y_ref);
-+
-+    std::atomic<bool> stopped(false);
-+    StreamingOneWesolowskiCallback weso(
-+        D,
-+        num_iterations,
-+        k,
-+        l,
-+        limit,
-+        B,
-+        use_getblock_opt,
-+        progress_interval,
-+        progress_cb,
-+        progress_user_data);
-+
-+    if (!weso.init_ok()) {
-+        return empty_result();
-+    }
-+
-+    weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false);
-+
-+    FastStorage* fast_storage = nullptr;
-+    repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
-+
-+    if (!weso.ok()) {
-+        return empty_result();
-+    }
-+    if (!(weso.y() == y_ref)) {
-+        return empty_result();
-+    }
-+
-+    form proof_form = weso.finalize_proof();
-+
-+    if (weso.stats_ok()) {
-+        last_streaming_stats = weso.stats();
-+    }
-+
-+    int d_bits = D.num_bits();
-+    std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
-+    std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
-+
-+    if (y_serialized.empty() || proof_serialized.empty()) {
-+        return empty_result();
-+    }
-+
-+    const size_t total = y_serialized.size() + proof_serialized.size();
-+    uint8_t* out = new uint8_t[total];
-+    std::copy(y_serialized.begin(), y_serialized.end(), out);
-+    std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
-+    return ChiavdfByteArray{out, total};
-+}
-+} // namespace
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations) {
-+    return chiavdf_prove_one_weso_fast_with_progress(
-+        challenge_hash,
-+        challenge_size,
-+        x_s,
-+        x_s_size,
-+        discriminant_size_bits,
-+        num_iterations,
-+        /*progress_interval=*/0,
-+        /*progress_cb=*/nullptr,
-+        /*progress_user_data=*/nullptr);
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data) {
-+    try {
-+        std::call_once(init_once, init_chiavdf_fast);
-+
-+        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
-+            return empty_result();
-+        }
-+        if (num_iterations == 0) {
-+            return empty_result();
-+        }
-+
-+        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
-+        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
-+        integer L = root(-D, 4);
-+
-+        form x = DeserializeForm(D, x_s, x_s_size);
-+
-+        std::atomic<bool> stopped(false);
-+        ProgressOneWesolowskiCallback weso(
-+            D,
-+            x,
-+            num_iterations,
-+            progress_interval,
-+            progress_cb,
-+            progress_user_data);
-+
-+        // Run the fast repeated-squaring engine to `num_iterations`.
-+        // The callback stores all intermediates needed for the proof.
-+        FastStorage* fast_storage = nullptr;
-+        repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
-+
-+        // Now generate the compact proof from the stored intermediates.
-+        Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped);
-+        if (proof.y.empty() || proof.proof.empty()) {
-+            return empty_result();
-+        }
-+
-+        const size_t total = proof.y.size() + proof.proof.size();
-+        uint8_t* out = new uint8_t[total];
-+        std::copy(proof.y.begin(), proof.y.end(), out);
-+        std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size());
-+        return ChiavdfByteArray{out, total};
-+    } catch (...) {
-+        return empty_result();
-+    }
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations) {
-+    return chiavdf_prove_one_weso_fast_streaming_with_progress(
-+        challenge_hash,
-+        challenge_size,
-+        x_s,
-+        x_s_size,
-+        y_ref_s,
-+        y_ref_s_size,
-+        discriminant_size_bits,
-+        num_iterations,
-+        /*progress_interval=*/0,
-+        /*progress_cb=*/nullptr,
-+        /*progress_user_data=*/nullptr);
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data) {
-+    try {
-+        return chiavdf_prove_one_weso_fast_streaming_impl(
-+            challenge_hash,
-+            challenge_size,
-+            x_s,
-+            x_s_size,
-+            y_ref_s,
-+            y_ref_s_size,
-+            discriminant_size_bits,
-+            num_iterations,
-+            progress_interval,
-+            progress_cb,
-+            progress_user_data,
-+            /*use_getblock_opt=*/false);
-+    } catch (...) {
-+        return empty_result();
-+    }
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations) {
-+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-+        challenge_hash,
-+        challenge_size,
-+        x_s,
-+        x_s_size,
-+        y_ref_s,
-+        y_ref_s_size,
-+        discriminant_size_bits,
-+        num_iterations,
-+        /*progress_interval=*/0,
-+        /*progress_cb=*/nullptr,
-+        /*progress_user_data=*/nullptr);
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data) {
-+    try {
-+        return chiavdf_prove_one_weso_fast_streaming_impl(
-+            challenge_hash,
-+            challenge_size,
-+            x_s,
-+            x_s_size,
-+            y_ref_s,
-+            y_ref_s_size,
-+            discriminant_size_bits,
-+            num_iterations,
-+            progress_interval,
-+            progress_cb,
-+            progress_user_data,
-+            /*use_getblock_opt=*/true);
-+    } catch (...) {
-+        return empty_result();
-+    }
-+}
-+
-+extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) {
-+    bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed);
-+}
-+
-+extern "C" void chiavdf_set_enable_streaming_stats(bool enable) {
-+    streaming_stats_enabled.store(enable, std::memory_order_relaxed);
-+    last_streaming_stats = LastStreamingStats{};
-+}
-+
-+extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) {
-+    if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) {
-+        return false;
-+    }
-+    if (!last_streaming_parameters.set) {
-+        return false;
-+    }
-+    *out_k = last_streaming_parameters.k;
-+    *out_l = last_streaming_parameters.l;
-+    *out_tuned = last_streaming_parameters.tuned;
-+    return true;
-+}
-+
-+extern "C" bool chiavdf_get_last_streaming_stats(
-+    uint64_t* out_checkpoint_total_ns,
-+    uint64_t* out_checkpoint_event_total_ns,
-+    uint64_t* out_finalize_total_ns,
-+    uint64_t* out_checkpoint_calls,
-+    uint64_t* out_bucket_updates) {
-+    if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr ||
-+        out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr ||
-+        out_bucket_updates == nullptr) {
-+        return false;
-+    }
-+    if (!last_streaming_stats.set) {
-+        return false;
-+    }
-+    *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns;
-+    *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns;
-+    *out_finalize_total_ns = last_streaming_stats.finalize_total_ns;
-+    *out_checkpoint_calls = last_streaming_stats.checkpoint_calls;
-+    *out_bucket_updates = last_streaming_stats.bucket_updates;
-+    return true;
-+}
-+
-+extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }
-diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
-new file mode 100644
-index 0000000..bf33f32
---- /dev/null
-+++ b/src/c_bindings/fast_wrapper.h
-@@ -0,0 +1,145 @@
-+#pragma once
-+
-+#include <stdbool.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+typedef struct {
-+    uint8_t* data;
-+    size_t length;
-+} ChiavdfByteArray;
-+
-+typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
-+
-+// Configure the per-process memory budget used by the parameter tuner when
-+// selecting `(k,l)` for streaming/bucket-based proving.
-+//
-+// The budget is per worker process (not global across multiple processes).
-+//
-+// If `bytes` is 0, the default chiavdf heuristic is used.
-+void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes);
-+
-+// Debug helper: returns the `(k,l)` parameters selected for the most recent
-+// streaming proof computed on the current thread.
-+//
-+// Returns true if parameters are available.
-+bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned);
-+
-+// Enable lightweight timing counters for the streaming prover.
-+//
-+// When enabled, the native library records basic timing counters for the most
-+// recent streaming proof computed on the current thread. This is intended for
-+// benchmarking and tuning; production runs should keep this disabled to avoid
-+// extra overhead.
-+void chiavdf_set_enable_streaming_stats(bool enable);
-+
-+// Debug helper: returns timing counters for the most recent streaming proof on
-+// the current thread.
-+//
-+// Returns true if stats are available (i.e. stats enabled and a streaming proof
-+// was computed successfully).
-+bool chiavdf_get_last_streaming_stats(
-+    uint64_t* out_checkpoint_total_ns,
-+    uint64_t* out_checkpoint_event_total_ns,
-+    uint64_t* out_finalize_total_ns,
-+    uint64_t* out_checkpoint_calls,
-+    uint64_t* out_bucket_updates);
-+
-+// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
-+//
-+// On success, returns `y || proof` where:
-+// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
-+// - `proof` is the serialized witness form (same size as `y`)
-+//
-+// On failure, returns `{NULL, 0}`.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations);
-+
-+// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
-+// the proving thread every `progress_interval` iterations completed.
-+//
-+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data);
-+
-+// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
-+// bucket-accumulation algorithm (Trick 1), which requires the expected output
-+// `y_ref` up front (as used by bluebox compaction jobs).
-+//
-+// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`).
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations);
-+
-+// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
-+// `progress_cb` from the proving thread every `progress_interval` iterations.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data);
-+
-+// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
-+// implementation of the `GetBlock()` mapping (avoids per-block modular
-+// exponentiation without allocating a full `GetBlock` table).
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations);
-+
-+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
-+// invokes `progress_cb` from the proving thread every `progress_interval`
-+// iterations.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data);
-+
-+void chiavdf_free_byte_array(ChiavdfByteArray array);
-+
-+#ifdef __cplusplus
-+}
-+#endif
-diff --git a/src/threading.h b/src/threading.h
-index 50d4b49..f6344ad 100644
---- a/src/threading.h
-+++ b/src/threading.h
-@@ -564,8 +564,8 @@ struct alignas(64) thread_counter {
-     }
- };
- 
--thread_counter master_counter[100];
--thread_counter slave_counter[100];
-+thread_counter master_counter[512];
-+thread_counter slave_counter[512];
- 
- struct thread_state {
-     int pairindex;
-diff --git a/src/vdf.h b/src/vdf.h
-index 9ab4aef..4544fe2 100644
---- a/src/vdf.h
-+++ b/src/vdf.h
-@@ -78,6 +78,18 @@ std::mutex new_event_mutex, cout_lock;
- bool debug_mode = false;
- bool fast_algorithm = false;
- bool two_weso = false;
-+bool quiet_mode = false;
-+
-+// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
-+// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
-+// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
-+// run concurrently in the same process; they must not share a pairindex.
-+inline int vdf_fast_pairindex() {
-+    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
-+    static std::atomic<int> next_slot{0};
-+    thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
-+    return slot;
-+}
- 
- //always works
- void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, const integer& L, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
-@@ -137,7 +149,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
- 
-         // This works single threaded
-         square_state_type square_state;
--        square_state.pairindex=0;
-+        square_state.pairindex=vdf_fast_pairindex();
- 
-         uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
- 
-@@ -236,10 +248,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
-             }
-         #endif
-     }
--    {
--        // this shouldn't be needed but avoids some false positive in TSAN
--        std::lock_guard<std::mutex> lk(cout_lock);
--        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
-+    if (!quiet_mode) {
-+        {
-+            // this shouldn't be needed but avoids some false positive in TSAN
-+            std::lock_guard<std::mutex> lk(cout_lock);
-+            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
-+        }
-     }
- 
-     #ifdef VDF_TEST
-@@ -275,11 +289,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
-     proof_serialized = SerializeForm(proof_form, d_bits);
-     Proof proof(y_serialized, proof_serialized);
-     proof.witness_type = 0;
--    {
--        // this shouldn't be needed but avoids some false positive in TSAN
--        std::lock_guard<std::mutex> lk(cout_lock);
--        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
--    }
-     return proof;
- }
- 
-diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md
-new file mode 100644
-index 0000000..61cd1fd
---- /dev/null
-+++ b/docs/bluebox_compaction.md
-@@ -0,0 +1,49 @@
-+# Bluebox Compaction Optimizations
-+
-+This document describes the compaction-oriented proving path exposed by
-+`src/c_bindings/fast_wrapper.h` and implemented in
-+`src/c_bindings/fast_wrapper.cpp`.
-+
-+## Scope
-+
-+These APIs are intended for workloads where the expected VDF output (`y_ref`) is
-+already known up front (for example, bluebox compaction jobs). They are additive
-+and do not change the existing `c_wrapper` APIs.
-+
-+## Optimization 1: Streaming one-wesolowski
-+
-+Given `y_ref`, the prover computes:
-+
-+- `B = GetB(D, x, y_ref)` before squaring starts
-+
-+This enables a streaming algorithm that updates proof buckets at each
-+checkpoint during repeated squaring, instead of materializing the full
-+intermediate checkpoint array and scanning it after the loop. In practice this
-+substantially reduces memory usage for compaction workloads.
-+
-+## Optimization 2: Incremental GetBlock mapping
-+
-+For streaming checkpoint updates, bucket index selection repeatedly calls
-+`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and
-+advances sequential `p` values incrementally, avoiding full modular
-+exponentiation per call and avoiding a large lookup table.
-+
-+## Optimization 3: Memory-budgeted (k, l) tuning
-+
-+The wrapper can tune `(k, l)` under a configured memory budget:
-+
-+- `chiavdf_set_bucket_memory_budget_bytes(...)`
-+
-+If no tuned candidate is found, the code falls back to the standard parameter
-+heuristics.
-+
-+## Operational Notes
-+
-+- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to
-+  avoid unsolicited stdout noise when embedded in multi-worker clients.
-+- Thread-slot assignment for the fast VDF counters is per-thread via
-+  `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations
-+  run in one process.
-+- The production default for `enable_threads` in `parameters.h` is unchanged from
-+  upstream to preserve timelord expectations.
-+
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index af3bf805..ae834c84 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -169,11 +169,12 @@ bool tune_streaming_parameters(
                 continue;
             }
 
-            unsigned __int128 updates = static_cast<unsigned __int128>(
-                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));
             uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
             unsigned __int128 checkpoints = static_cast<unsigned __int128>(
                 (num_iterations + kl - 1) / kl);
+            // Each checkpoint can trigger up to `l` bucket updates (one per sub-block).
+            // Model update work as checkpoint-count scaled by `l`.
+            unsigned __int128 updates = checkpoints * static_cast<unsigned __int128>(l);
             unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
             unsigned __int128 cost =
                 updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;

From 61e9280ead4d4cd19caf8f0984b4ab562730ad6a Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 20:08:54 -0700
Subject: [PATCH 12/21] Adapt streaming callback checkpoint scheduling from
 fb0e2c2.

Replace per-iteration modulo checks with next-checkpoint tracking in the streaming callback, and integrate the scheduling update with batch replay boundaries so rollback/replay semantics remain correct in the current upstreamed implementation.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index ae834c84..c3351e50 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -295,6 +295,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
           progress_cb(progress_cb),
           progress_user_data(progress_user_data),
           next_progress(progress_interval),
+          next_checkpoint_t((limit <= 1 || kl == 0) ? std::numeric_limits<uint64_t>::max() : kl),
           use_getblock_opt(use_getblock_opt),
           stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
         form id = form::identity(D);
@@ -328,7 +329,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
             next_progress += progress_interval;
         }
 
-        if (iteration % kl == 0) {
+        if (iteration == next_checkpoint_t) {
             uint64_t pos = iteration / kl;
             if (pos < limit) {
                 form checkpoint;
@@ -348,6 +349,14 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
                             .count());
                 }
             }
+
+            const uint64_t next_pos = pos + 1;
+            if (next_pos < limit && kl != 0 &&
+                next_pos <= std::numeric_limits<uint64_t>::max() / kl) {
+                next_checkpoint_t = next_pos * kl;
+            } else {
+                next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+            }
         }
 
         if (iteration == wanted_iter) {
@@ -361,6 +370,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         if (batch_size == 0) {
             batch_start_iteration = 1;
             batch_end_iteration = 0;
+            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
             return;
         }
         // `base_iteration` is the number of completed iterations before this batch.
@@ -371,6 +381,24 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         } else {
             batch_end_iteration = base_iteration + batch_size;
         }
+
+        if (kl == 0 || limit <= 1) {
+            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+            return;
+        }
+
+        const uint64_t first_iteration = saturating_add_u64(base_iteration, 1);
+        const uint64_t numerator = saturating_add_u64(first_iteration, kl - 1);
+        uint64_t first_pos = numerator / kl;
+        if (first_pos == 0) {
+            first_pos = 1;
+        }
+
+        if (first_pos < limit && first_pos <= std::numeric_limits<uint64_t>::max() / kl) {
+            next_checkpoint_t = first_pos * kl;
+        } else {
+            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+        }
     }
 
     void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {
@@ -524,6 +552,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     ChiavdfProgressCallback progress_cb;
     void* progress_user_data;
     uint64_t next_progress;
+    uint64_t next_checkpoint_t = std::numeric_limits<uint64_t>::max();
     size_t bucket_span = 0;
 
     std::vector<form> buckets;

From 91a2af96cd83525c20badf67a53727daccf657db Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 20:35:42 -0700
Subject: [PATCH 13/21] Address slot reuse and logging consistency in vdf fast
 path.

Lease fast counter slots with per-slot in-use tracking so long-lived processes can recycle released slots safely, and restore the one-weso proof diagnostic behind quiet_mode to keep client logging behavior consistent.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/vdf.h | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/src/vdf.h b/src/vdf.h
index eb1d0d39..92a56b78 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -94,8 +94,6 @@ bool quiet_mode = false;
 // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
 // run concurrently in the same process; they must not share a pairindex.
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
-// Keep slot allocation state as one program-wide entity for all TUs that include
-// this header, so concurrent callers cannot recycle the same slot sequence.
 inline std::atomic<unsigned int> vdf_fast_next_slot{0};
 #endif
 
@@ -103,8 +101,45 @@ inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
     static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
-    thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
-    return slot;
+    static std::array<std::atomic<bool>, kSlots> vdf_fast_slot_in_use{};
+    struct SlotLease {  // per-thread record of the counter slot in use; the destructor releases an owned slot
+        std::array<std::atomic<bool>, kSlots>* slots = nullptr;  // in-use table the leased slot belongs to
+        int slot = -1;  // -1 until a slot has been assigned to this thread
+        bool owns_slot = false;  // true only when this lease set the in-use flag itself
+        ~SlotLease() {
+            if (owns_slot && slots != nullptr && slot >= 0) {  // never clear a flag this lease did not set
+                (*slots)[static_cast<size_t>(slot)].store(false, std::memory_order_release);
+            }
+        }
+    };
+
+    thread_local SlotLease lease;
+    if (lease.slot >= 0) {
+        return lease.slot;
+    }
+
+    lease.slots = &vdf_fast_slot_in_use;
+
+    const unsigned int start = vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed);
+    for (unsigned int i = 0; i < kSlots; i++) {
+        const unsigned int candidate = (start + i) % kSlots;
+        bool expected = false;
+        if (vdf_fast_slot_in_use[candidate].compare_exchange_strong(
+                expected,
+                true,
+                std::memory_order_acq_rel,
+                std::memory_order_relaxed)) {
+            lease.slot = static_cast<int>(candidate);
+            lease.owns_slot = true;
+            return lease.slot;
+        }
+    }
+
+    // All slots are currently active. Reuse one as a best-effort fallback; the
+    // fast path has corruption detection and can fall back to slow squaring.
+    lease.slot = static_cast<int>(start % kSlots);
+    lease.owns_slot = false;
+    return lease.slot;
 #else
     return 0;
 #endif
@@ -367,6 +402,12 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
     proof_serialized = SerializeForm(proof_form, d_bits);
     Proof proof(y_serialized, proof_serialized);
     proof.witness_type = 0;
+    if (!quiet_mode) {
+        // Keep proof diagnostics available for vdf_client while quiet_mode
+        // suppresses output in embedded library-mode call paths.
+        std::lock_guard<std::mutex> lk(cout_lock);
+        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
+    }
     return proof;
 }
 

From 1e19548e9959455e6817dcaadb36009c421ddc72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Mora?= <ealrann@gmail.com>
Date: Wed, 21 Jan 2026 17:46:30 +0100
Subject: [PATCH 14/21] trick 2

(cherry picked from commit f3c73bf046e155aad2a8a9417496550b1e2d8cd5)
---
 BBR_BLUEBOX_COMPACTION_OVERVIEW.md | 308 +++++++++++++
 src/c_bindings/fast_wrapper.cpp    | 713 ++++++++++++++++++++---------
 src/c_bindings/fast_wrapper.h      |  26 +-
 3 files changed, 818 insertions(+), 229 deletions(-)
 create mode 100644 BBR_BLUEBOX_COMPACTION_OVERVIEW.md

diff --git a/BBR_BLUEBOX_COMPACTION_OVERVIEW.md b/BBR_BLUEBOX_COMPACTION_OVERVIEW.md
new file mode 100644
index 00000000..ba15ac86
--- /dev/null
+++ b/BBR_BLUEBOX_COMPACTION_OVERVIEW.md
@@ -0,0 +1,308 @@
+# BBR / Chia Bluebox VDF Proof Compaction — Overview + Implemented Performance Tricks
+
+This document summarizes how Chia “bluebox” compaction jobs are computed in this repo, and the performance tweaks we implemented in `bbr_chiavdf/` (a fork/copy of `chiavdf/`).
+
+It is written to be understandable even if you’re not already fluent with Chia’s VDF / classgroup implementation details.
+
+## 1) What “bluebox compaction” computes
+
+Chia blocks include a VDF output (“VDFInfo”) for some VDF “slot” (end-of-slot, signage point, infusion point, etc.). A full node can accept a **compact proof of time** (“VDFProof”) for a given VDFInfo:
+
+- Input element `x` (for compaction jobs this is always the **default/canonical classgroup element**, i.e. the fixed generator form Chia uses as the starting input of every VDF — not the identity element)
+- Discriminant `D` derived from the VDF challenge
+- Number of iterations `T`
+- Output element `y` (already known from the block’s VDFInfo)
+
+The bluebox worker’s job is to compute the **compact Wesolowski witness** `π` (witness_type = 0) such that the proof verifies for `(D, x, y, T)`.
+
+### Inputs (per proof job)
+
+All values are byte strings unless noted otherwise.
+
+- `challenge`: 32 bytes (`VDFInfo.challenge`)
+- `T`: `u64` (`VDFInfo.number_of_iterations`)
+- `y_ref`: 100 bytes serialized classgroup element (`VDFInfo.output`)
+- `size_bits`: discriminant size in bits (typically 1024; Chia consensus constant)
+- `x0`: canonical input element for compaction:
+  - `x0_bytes = ClassgroupElement.get_default_element().data` (100 bytes)
+
+### Outputs (per proof job)
+
+- `y`: serialized output element (should equal `y_ref`)
+- `proof`: serialized witness element `π` (same size as `y`)
+- In our C ABI wrappers we return `y || proof` (concatenation, typically 200 bytes for 1024-bit discriminants).
+
+## 2) Underlying primitives (high-level)
+
+Chia’s VDF uses the class group of binary quadratic forms for a negative discriminant `D`:
+
+- `D` is derived deterministically from `(challenge, size_bits)` via `CreateDiscriminant(...)`.
+- Elements are represented as reduced forms `form { a, b, c }` (each is a GMP big integer).
+- The VDF evaluation is the deterministic repeated squaring chain:
+  - `f(0) = x0`
+  - `f(t+1) = square(f(t))` (with reduction)
+  - `y = f(T) = x0^(2^T)` in the class group
+
+The compact proof is a Wesolowski proof, which (in this implementation) uses a per-proof prime `B` derived from the input and output:
+
+- `B = GetB(D, x0_form, y_ref_form)` where `GetB` hashes serialized forms then runs `HashPrime(...)`.
+- Because `B` depends on `y_ref`, if `y_ref` is known up front then **`B` is known before squaring starts**.
+
+## 3) Baseline chiavdf “one-weso” compaction (two-phase)
+
+The upstream chiavdf compact witness path (“one-weso”) is:
+
+1. **Squaring phase**
+   - Run the VDF evaluation (sequential squaring) from `x0` to iteration `T`.
+   - Store many intermediate forms (“checkpoints”) in an array at a fixed cadence.
+
+2. **Proof phase**
+   - After squaring is finished, scan those stored checkpoints.
+   - Multiply them into “buckets” `ys[j][b]` using a mapping (`GetBlock`) that depends on `B`.
+   - Fold the bucket structure into a final `proof_form`.
+
+### Proof parameters `k, l, kl`
+
+chiavdf computes parameters:
+
+- `(k, l) = ApproximateParameters(T)`
+- `kl = k * l`
+- Number of checkpoint indices:
+  - `limit = ceil(T / kl)` (the number of checkpoint positions that may be used)
+
+These parameters control how many checkpoints are used and how bucket folding is structured.
+
+### Costs (baseline)
+
+- Memory: stores `O(ceil(T/kl))` checkpoint forms (each form holds several GMP big integers).
+- Time: wall-clock is essentially `t_total = t_square + t_proof` because proof work happens after squaring.
+
+## 4) Tweak / Trick 1 — “Streaming one-weso” using known output (`y_ref`)
+
+### Key idea
+
+For bluebox compaction, `y_ref` is already known from the block. Because `B` depends on `y_ref`, we can compute `B` before starting squaring.
+
+That lets us avoid storing checkpoint forms and instead update the proof buckets **as soon as each checkpoint is reached**, using the current `f(t)` value.
+
+### Algorithm (single job, streaming buckets)
+
+Inputs: `(challenge, size_bits, x0_bytes, y_ref_bytes, T)`
+
+1. Compute `D = CreateDiscriminant(challenge, size_bits)` and `L = root(-D, 4)` (chiavdf convention).
+2. Deserialize:
+   - `x0_form = DeserializeForm(D, x0_bytes)`
+   - `y_ref_form = DeserializeForm(D, y_ref_bytes)`
+3. Compute:
+   - `B = GetB(D, x0_form, y_ref_form)`
+   - `(k, l) = ApproximateParameters(T)` (fallback `k=10,l=1` for small `T`)
+   - `kl = k*l`
+   - `limit = ceil(T/kl)`
+4. Allocate buckets:
+   - `ys[j][b]` for `j ∈ [0, l)` and `b ∈ [0, 2^k)`
+   - Initialize all buckets to the identity form.
+5. Run the VDF squaring chain up to `T`, but:
+   - At each checkpoint time `t = i*kl`, compute `checkpoint = f(t)` and call `process_checkpoint(i, checkpoint)`:
+     - For each `j ∈ [0, l)`:
+       - `p = i*l + j`
+       - If `T >= k*(p+1)`, compute `b = GetBlock(p, k, T, B)` and
+         multiply `ys[j][b] *= checkpoint` (via `nucomp_form`); otherwise skip this `p`.
+6. At the end, compute `y = f(T)` and check `y == y_ref_form` (debug/safety guard).
+7. Fold buckets to compute the final proof form (same folding logic as chiavdf).
+8. Serialize `y` and `proof` and return `y || proof`.
+
+### What changed vs baseline
+
+- We no longer store an array of checkpoint forms.
+- Bucket multiplication occurs “online” during squaring.
+- Folding/finalization stays the same as chiavdf.
+
+### Costs / tradeoffs
+
+- Memory becomes `O(l * 2^k)` forms (the bucket table) instead of `O(ceil(T/kl))` checkpoint forms.
+- Runtime can sometimes overlap bucket updates with squaring, but in practice the speedup depends on which part dominates (squaring vs `nucomp_form` multiplications).
+
+### Where this lives in `bbr_chiavdf/`
+
+- C ABI entrypoints:
+  - `chiavdf_prove_one_weso_fast_streaming(...)`
+  - `chiavdf_prove_one_weso_fast_streaming_with_progress(...)`
+- Implementation:
+  - `bbr_chiavdf/src/c_bindings/fast_wrapper.cpp`
+  - `StreamingOneWesolowskiCallback` and the bucket helper (`StreamingWesolowskiBuckets`).
+
+## 5) GetBlock optimization (precompute `GetBlock(p)` table per job)
+
+In streaming (and in baseline), for each checkpoint update we need:
+
+- `b = GetBlock(p, k, T, B)`
+
+Naively this uses per-`p` modular exponentiation and division, which is expensive with GMP big integers.
+
+### Optimization idea
+
+For fixed `(T, k, B)`, define:
+
+- `r_p = 2^{T - k*(p+1)} mod B`
+- `b_p = floor((r_p * 2^k) / B)` (integer division)
+
+Then:
+
+- `r_{p+1} = r_p * inv(2^k) mod B` where `inv(2^k)` is the modular inverse of `2^k mod B`
+
+So we can compute all `b_p` iteratively with a single modular inverse plus one modular multiplication and one division per `p`, instead of one modular exponentiation per `p`.
+
+### Tradeoff
+
+We store `precomputed_blocks[p]` for all `p` used by the proof:
+
+- Memory: `O(limit * l)` `u32` values per job.
+  - For typical compaction-scale `T` this is often a few MB per job.
+
+### Where this lives
+
+- `bbr_chiavdf/src/c_bindings/fast_wrapper.cpp`:
+  - `build_precomputed_getblocks(...)`
+  - Used by:
+    - `chiavdf_prove_one_weso_fast_streaming_getblock_opt(...)`
+    - `chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(...)`
+
+## 6) Trick 2 — discriminant reuse (“multi-target VDF engine”)
+
+### Key observation
+
+For a fixed group key `(challenge, size_bits, x0_bytes)`, the discriminant `D` and the entire squaring trajectory `f(t)` are identical for all jobs:
+
+- Only `T_j` and `y_ref_j` differ across jobs.
+
+Therefore, if you have `N` jobs sharing a group key:
+
+- Without reuse: total squaring work is `Σ T_j`
+- With reuse: total squaring work is exactly `T_max = max(T_j)`
+
+### Grouping key
+
+Jobs can be grouped if and only if:
+
+- Same `challenge`
+- Same `size_bits`
+- Same `x0_bytes`
+
+For bluebox compaction, `x0_bytes` is always the default element, so grouping is mostly “same challenge”.
+
+### Algorithm (batch)
+
+Inputs (shared):
+
+- `challenge`, `x0_bytes`, `size_bits`
+
+Inputs (per job `j`):
+
+- `T_j`, `y_ref_j`
+
+Per job setup (done before squaring starts):
+
+1. Deserialize `y_ref_form_j`
+2. Compute `B_j = GetB(D, x0_form, y_ref_form_j)`
+3. Compute `(k_j, l_j)`, `kl_j`, `limit_j`
+4. Allocate `ys_j` buckets (Trick 1)
+5. Precompute `GetBlock` table for that job (GetBlock opt)
+
+Shared squaring run:
+
+- Run `repeated_square(T_max, ...)` once to generate `f(t)` for all times up to `T_max`.
+- Maintain per job:
+  - `next_checkpoint_t_j` initialized to `kl_j` (we process `i=0` immediately at `t=0`)
+  - completion time `T_j`
+- At each “event time” `t`:
+  1. For every job where `t == next_checkpoint_t_j`:
+     - `i = t / kl_j`
+     - `ys_j` bucket update with checkpoint form `f(t)` (Trick 1)
+     - `next_checkpoint_t_j += kl_j`
+  2. For every job where `t == T_j`:
+     - Debug check: `f(T_j) == y_ref_form_j`
+       - If mismatch: fail the whole batch (the batch function returns `NULL`); this signals a backend grouping/data bug.
+     - Finalize proof for that job (fold buckets → proof form) and serialize result.
+     - Free job state (buckets, GetBlock table) to reduce peak RAM.
+
+### Concurrency / offloading finalization
+
+- The shared squaring chain itself is sequential by definition.
+- Bucket updates are triggered by exact `f(t)` values; in our implementation they are done on the squaring callback thread to avoid copying forms or storing a large checkpoint history.
+- Finalization (folding + serialization) is **per job** and can be offloaded:
+  - Once a job reaches `T_j` and passes the `f(T_j)==y_ref_j` check, its proof no longer depends on future squaring.
+  - We offload finalization to a `std::thread` per completed job so the squaring run can continue toward larger `T`.
+
+### Where this lives
+
+- New C ABI:
+  - `bbr_chiavdf/src/c_bindings/fast_wrapper.h`:
+    - `ChiavdfBatchJob`
+    - `chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(...)`
+    - `chiavdf_free_byte_array_batch(...)`
+- Implementation:
+  - `bbr_chiavdf/src/c_bindings/fast_wrapper.cpp`:
+    - `BatchOneWesolowskiCallback`
+    - `BatchJobState`
+    - Uses `StreamingWesolowskiBuckets` per job
+
+### Error policy (mismatch)
+
+We keep a strict mismatch check specifically for debugging backend grouping / job data issues:
+
+- If the computed output `f(T_j)` differs from `y_ref_form_j`, the batch function returns `NULL` (fatal error).
+
+This should never happen in normal operation, but the check is useful for detecting wrong grouping inputs early.
+
+## 7) Rough resource model (what consumes time and RAM)
+
+### Time
+
+Three main contributors:
+
+1. **Squaring chain** (`repeated_square(...)`): inherently sequential per group.
+2. **Bucket updates**: `nucomp_form` multiplications at checkpoint times; scales with number of jobs and number of checkpoints.
+3. **Finalization**: folding buckets into a proof; per job.
+
+Trick 2 reduces (1) across jobs by reusing squaring work.
+
+### Memory (per job, within a group)
+
+Dominant memory terms:
+
+- Buckets: `l * 2^k` forms (each form holds multiple GMP big ints) — often several MB per job.
+- GetBlock precompute: `limit * l` `u32` values — often a few MB per job.
+
+Peak memory per group is roughly linear in the number of jobs active at the same time (and drops as jobs complete and are freed).
+
+## 8) Things to look at next (possible improvement areas)
+
+This section is intentionally a “menu” for further investigation.
+
+1. **Hotspots inside classgroup arithmetic**
+   - If perf shows most time in `nucomp_form` / GMP, then:
+     - reduce allocations (GMP mpz churn) with pooling or reuse
+     - explore alternative big-int backends / tuned GMP build / CPU-specific flags
+     - reduce constant factors in `nucomp_form` (algorithmic / assembly improvements)
+
+2. **Reduce per-iteration callback overhead**
+   - Today `OnIteration` is called for every iteration, even though we only act on sparse “event times”.
+   - If this overhead becomes visible at huge `T`, consider:
+     - extending the core loop to support “next event” iteration skipping (intrusive change)
+     - or internal batching in the callback path
+
+3. **Finalization optimization**
+   - Each job finalization constructs a reducer and folds buckets.
+   - Potential wins:
+     - reuse reducers per thread
+     - reduce intermediate `form` temporaries and copies
+
+4. **Group sizing / scheduling**
+   - For Trick 2, there’s a throughput vs RAM tradeoff.
+   - Consider dynamic group size based on memory budget and `T` distribution.
+
+5. **Optional: parallelize bucket updates (hard)**
+   - Bucket updates need the checkpoint form `f(t)` at exact times.
+   - Parallelizing this without copying/storing forms requires careful design (e.g. immutable snapshots, reference counting, or storing a checkpoint history).
+   - This is the next “big step” if per-job proof work becomes the bottleneck even after squaring reuse.
+
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index c3351e50..d3ffef65 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -1,11 +1,13 @@
 #include "fast_wrapper.h"
 
+#include <algorithm>
 #include <atomic>
 #include <cassert>
 #include <chrono>
 #include <cstdio>
 #include <limits>
 #include <mutex>
+#include <thread>
 #include <vector>
 
 #include "../vdf.h"
@@ -271,31 +273,25 @@ class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {
     uint64_t next_progress;
 };
 
-class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
+class StreamingWesolowskiBuckets {
   public:
-    StreamingOneWesolowskiCallback(
+    StreamingWesolowskiBuckets(
         integer& D,
+        integer& L,
         uint64_t wanted_iter,
         uint32_t k,
         uint32_t l,
         uint64_t limit,
-        integer& B,
-        bool use_getblock_opt,
-        uint64_t progress_interval,
-        ChiavdfProgressCallback progress_cb,
-        void* progress_user_data)
-        : WesolowskiCallback(D),
+        integer B,
+        bool use_getblock_opt)
+        : D(D),
+          L(L),
           wanted_iter(wanted_iter),
           k(k),
           l(l),
           kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
           limit(limit),
-          B(B),
-          progress_interval(progress_interval),
-          progress_cb(progress_cb),
-          progress_user_data(progress_user_data),
-          next_progress(progress_interval),
-          next_checkpoint_t((limit <= 1 || kl == 0) ? std::numeric_limits<uint64_t>::max() : kl),
+          B(std::move(B)),
           use_getblock_opt(use_getblock_opt),
           stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
         form id = form::identity(D);
@@ -318,116 +314,17 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         }
     }
 
-    void OnIteration(int type, void* data, uint64_t iteration) override {
-        iteration++;
-        if (iteration > wanted_iter) {
-            return;
-        }
-
-        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
-            progress_cb(next_progress, progress_user_data);
-            next_progress += progress_interval;
-        }
-
-        if (iteration == next_checkpoint_t) {
-            uint64_t pos = iteration / kl;
-            if (pos < limit) {
-                form checkpoint;
-                auto started_at = std::chrono::steady_clock::time_point{};
-                if (stats_enabled) {
-                    started_at = std::chrono::steady_clock::now();
-                }
-                SetForm(type, data, &checkpoint);
-                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
-                if (iteration >= batch_start_iteration && iteration <= batch_end_iteration) {
-                    current_batch_checkpoints.push_back(BatchCheckpoint{pos, checkpoint});
-                }
-                if (stats_enabled) {
-                    checkpoint_event_total_ns += static_cast<uint64_t>(
-                        std::chrono::duration_cast<std::chrono::nanoseconds>(
-                            std::chrono::steady_clock::now() - started_at)
-                            .count());
-                }
-            }
+    uint64_t wanted_iterations() const { return wanted_iter; }
 
-            const uint64_t next_pos = pos + 1;
-            if (next_pos < limit && kl != 0 &&
-                next_pos <= std::numeric_limits<uint64_t>::max() / kl) {
-                next_checkpoint_t = next_pos * kl;
-            } else {
-                next_checkpoint_t = std::numeric_limits<uint64_t>::max();
-            }
-        }
+	    uint64_t checkpoint_stride() const { return kl; }
 
-        if (iteration == wanted_iter) {
-            SetForm(type, data, &result);
-            has_result = true;
-        }
-    }
-
-    void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) override {
-        current_batch_checkpoints.clear();
-        if (batch_size == 0) {
-            batch_start_iteration = 1;
-            batch_end_iteration = 0;
-            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
-            return;
-        }
-        // `base_iteration` is the number of completed iterations before this batch.
-        // `OnIteration` normalizes to 1-based (`iteration++`), so this batch is [base+1, base+size].
-        batch_start_iteration = base_iteration + 1;
-        if (std::numeric_limits<uint64_t>::max() - base_iteration < batch_size) {
-            batch_end_iteration = std::numeric_limits<uint64_t>::max();
-        } else {
-            batch_end_iteration = base_iteration + batch_size;
-        }
-
-        if (kl == 0 || limit <= 1) {
-            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
-            return;
-        }
+	    uint64_t checkpoint_limit() const { return limit; }
 
-        const uint64_t first_iteration = saturating_add_u64(base_iteration, 1);
-        const uint64_t numerator = saturating_add_u64(first_iteration, kl - 1);
-        uint64_t first_pos = numerator / kl;
-        if (first_pos == 0) {
-            first_pos = 1;
-        }
-
-        if (first_pos < limit && first_pos <= std::numeric_limits<uint64_t>::max() / kl) {
-            next_checkpoint_t = first_pos * kl;
-        } else {
-            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
-        }
-    }
-
-    void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {
-        for (const BatchCheckpoint& entry : current_batch_checkpoints) {
-            rollback_checkpoint(entry.index, entry.checkpoint);
-        }
-        OnBatchStart(base_iteration, batch_size);
-    }
-
-    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
-        apply_checkpoint(i, checkpoint, record_stats);
-    }
-
-  private:
-    struct BatchCheckpoint {
-        uint64_t index;
-        form checkpoint;
-    };
-
-    void rollback_checkpoint(uint64_t i, const form& checkpoint) {
-        form inverse_checkpoint = checkpoint.inverse();
-        apply_checkpoint(i, inverse_checkpoint, /*record_stats=*/false);
-    }
-
-    void apply_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
-        const bool do_stats = stats_enabled && record_stats;
-        auto started_at = std::chrono::steady_clock::time_point{};
-        if (do_stats) {
-            started_at = std::chrono::steady_clock::now();
+	    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats = true) {
+	        const bool do_stats = stats_enabled && record_stats;
+	        auto started_at = std::chrono::steady_clock::time_point{};
+	        if (do_stats) {
+	            started_at = std::chrono::steady_clock::now();
         }
 
         uint64_t local_updates = 0;
@@ -452,23 +349,18 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
                     std::chrono::steady_clock::now() - started_at)
                     .count());
         }
-    }
+	    }
 
-  public:
-    bool init_ok() const { return getblock_ok; }
+	    bool init_ok() const { return getblock_ok; }
 
-    bool ok() const { return has_result; }
+	    form finalize_proof() const {
+	        auto started_at = std::chrono::steady_clock::time_point{};
+	        if (stats_enabled) {
+	            started_at = std::chrono::steady_clock::now();
+	        }
 
-    const form& y() const { return result; }
-
-    form finalize_proof() {
-        auto started_at = std::chrono::steady_clock::time_point{};
-        if (stats_enabled) {
-            started_at = std::chrono::steady_clock::now();
-        }
-
-        PulmarkReducer reducer;
-        form id = form::identity(D);
+	        PulmarkReducer reducer;
+	        form id = form::identity(D);
 
         uint64_t k1 = k / 2;
         uint64_t k0 = k - k1;
@@ -485,8 +377,13 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
             for (uint64_t b1 = 0; b1 < span_k1; b1++) {
                 form z = id;
-                for (uint64_t b0 = 0; b0 < span_k0; b0++) {
-                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * span_k0 + b0), D, L);
+                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
+                    nucomp_form(
+                        z,
+                        z,
+                        bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0),
+                        D,
+                        L);
                 }
                 z = FastPowFormNucomp(
                     z,
@@ -499,8 +396,13 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
             for (uint64_t b0 = 0; b0 < span_k0; b0++) {
                 form z = id;
-                for (uint64_t b1 = 0; b1 < span_k1; b1++) {
-                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * span_k0 + b0), D, L);
+                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
+                    nucomp_form(
+                        z,
+                        z,
+                        bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0),
+                        D,
+                        L);
                 }
                 z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
                 nucomp_form(x, x, z, D, L);
@@ -516,13 +418,19 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
                     .count());
         }
         return x;
-    }
+	    }
 
-    bool stats_ok() const { return stats_enabled; }
+	    bool stats_ok() const { return stats_enabled; }
 
-    LastStreamingStats stats() const {
-        LastStreamingStats out;
-        out.checkpoint_total_ns = checkpoint_total_ns;
+	    void record_checkpoint_event_ns(uint64_t ns) {
+	        if (stats_enabled) {
+	            checkpoint_event_total_ns += ns;
+	        }
+	    }
+
+	    LastStreamingStats stats() const {
+	        LastStreamingStats out;
+	        out.checkpoint_total_ns = checkpoint_total_ns;
         out.checkpoint_event_total_ns = checkpoint_event_total_ns;
         out.finalize_total_ns = finalize_total_ns;
         out.checkpoint_calls = checkpoint_calls;
@@ -542,22 +450,15 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         return buckets[idx];
     }
 
+    integer& D;
+    integer& L;
     uint64_t wanted_iter;
     uint32_t k;
-    uint32_t l;
-    uint64_t kl;
-    uint64_t limit;
-    integer B;
-    uint64_t progress_interval;
-    ChiavdfProgressCallback progress_cb;
-    void* progress_user_data;
-    uint64_t next_progress;
-    uint64_t next_checkpoint_t = std::numeric_limits<uint64_t>::max();
-    size_t bucket_span = 0;
-
-    std::vector<form> buckets;
-    form result;
-    bool has_result = false;
+	    uint32_t l;
+	    uint64_t kl;
+	    uint64_t limit;
+	    integer B;
+	    std::vector<form> buckets;
 
     bool use_getblock_opt;
     bool getblock_ok = true;
@@ -569,12 +470,12 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     uint64_t batch_end_iteration = 0;
     std::vector<BatchCheckpoint> current_batch_checkpoints;
 
-    bool stats_enabled;
-    uint64_t checkpoint_total_ns = 0;
-    uint64_t checkpoint_event_total_ns = 0;
-    uint64_t finalize_total_ns = 0;
-    uint64_t checkpoint_calls = 0;
-    uint64_t bucket_updates = 0;
+	    bool stats_enabled;
+	    uint64_t checkpoint_total_ns = 0;
+	    uint64_t checkpoint_event_total_ns = 0;
+	    mutable uint64_t finalize_total_ns = 0;
+	    uint64_t checkpoint_calls = 0;
+	    uint64_t bucket_updates = 0;
 
     bool init_getblock_opt_state() {
         if (k == 0) {
@@ -582,6 +483,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         }
         uint64_t k_u64 = static_cast<uint64_t>(k);
         if (wanted_iter < k_u64) {
+            getblock_next_p = 0;
             return true;
         }
 
@@ -623,6 +525,284 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     }
 };
 
+class StreamingOneWesolowskiCallback final : public WesolowskiCallback {  // Single-proof callback: feeds checkpoints to the buckets, reports progress, captures the final form.
+  public:
+    StreamingOneWesolowskiCallback(
+        integer& discriminant,
+        uint64_t wanted_iter,
+        uint32_t k,
+        uint32_t l,
+        uint64_t limit,
+        integer B,
+        bool use_getblock_opt,
+        uint64_t progress_interval,
+        ChiavdfProgressCallback progress_cb,
+        void* progress_user_data)
+        : WesolowskiCallback(discriminant),
+          buckets(
+              this->D,  // D and L are not declared in this class, so they come from the WesolowskiCallback base.
+              this->L,
+              wanted_iter,
+              k,
+              l,
+              limit,
+              std::move(B),
+              use_getblock_opt),
+          progress_interval(progress_interval),
+          progress_cb(progress_cb),
+          progress_user_data(progress_user_data),
+          next_progress(progress_interval) {}  // First progress report is due once `progress_interval` iterations are reached.
+
+    bool init_ok() const { return buckets.init_ok(); }
+
+    void OnIteration(int type, void* data, uint64_t iteration) override {
+        iteration++;  // All comparisons below use the incremented value.
+        if (iteration > buckets.wanted_iterations()) {
+            return;  // Past the requested iteration count: nothing to record.
+        }
+
+        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
+            progress_cb(next_progress, progress_user_data);  // Reports the threshold, so values are multiples of progress_interval.
+            next_progress += progress_interval;
+        }
+
+	        uint64_t stride = buckets.checkpoint_stride();
+	        if (stride != 0 && iteration % stride == 0) {  // Only exact multiples of the stride are checkpoints.
+	            uint64_t pos = iteration / stride;
+	            if (pos < buckets.checkpoint_limit()) {  // Checkpoint indices at or past the limit are not processed.
+	                form checkpoint;
+	                auto started_at = std::chrono::steady_clock::time_point{};  // Only assigned a real time when stats are enabled.
+	                const bool do_stats = buckets.stats_ok();
+	                if (do_stats) {
+	                    started_at = std::chrono::steady_clock::now();
+	                }
+	                SetForm(type, data, &checkpoint);  // Presumably fills `checkpoint` from (type, data) - confirm against SetForm.
+	                buckets.process_checkpoint(pos, checkpoint);
+	                if (do_stats) {
+	                    buckets.record_checkpoint_event_ns(static_cast<uint64_t>(  // Elapsed time spans SetForm + process_checkpoint.
+	                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+	                            std::chrono::steady_clock::now() - started_at)
+	                            .count()));
+	                }
+	            }
+	        }
+
+        if (iteration == buckets.wanted_iterations()) {  // Exactly the requested iteration: capture the output form.
+            SetForm(type, data, &result);
+            has_result = true;
+        }
+	    }
+
+	    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats = true) {  // Direct pass-through to the buckets.
+	        buckets.process_checkpoint(i, checkpoint, record_stats);
+	    }
+
+    bool ok() const { return has_result; }  // True once OnIteration has captured the final form.
+
+	    const form& y() const { return result; }  // Captured final form; only assigned when ok() is true.
+
+	    form finalize_proof() const { return buckets.finalize_proof(); }
+
+	    bool stats_ok() const { return buckets.stats_ok(); }
+
+	    LastStreamingStats stats() const { return buckets.stats(); }
+
+	  private:
+	    StreamingWesolowskiBuckets buckets;
+    uint64_t progress_interval;  // 0 disables progress reporting (see the check in OnIteration).
+    ChiavdfProgressCallback progress_cb;  // nullptr disables progress reporting.
+    void* progress_user_data;  // Passed back unchanged as the second argument of progress_cb.
+    uint64_t next_progress;  // Next iteration threshold at which progress_cb fires.
+
+    form result;
+    bool has_result = false;
+};
+
+struct BatchJobState {  // Per-job bookkeeping for the batch callback (one instance per requested proof).
+    size_t index;  // Slot in the caller-visible output array written when this job is finalized.
+    uint64_t wanted_iter;  // Iteration at which this job's output is compared with y_ref.
+    uint32_t k;
+    uint32_t l;
+    uint64_t kl;  // k * l: spacing, in iterations, between this job's checkpoints.
+    uint64_t limit;  // Checkpoints with index >= limit are not fed to the buckets.
+    form y_ref;
+    StreamingWesolowskiBuckets buckets;
+    uint64_t next_checkpoint_t;
+    bool done = false;
+
+    BatchJobState(
+        size_t index,
+        uint64_t wanted_iter,
+        uint32_t k,
+        uint32_t l,
+        uint64_t limit,
+        form y_ref,
+        StreamingWesolowskiBuckets buckets)
+        : index(index),
+          wanted_iter(wanted_iter),
+          k(k),
+          l(l),
+          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),  // Widened first so the product is computed in 64 bits.
+          limit(limit),
+          y_ref(std::move(y_ref)),
+          buckets(std::move(buckets)),
+          next_checkpoint_t(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)) {}
+};
+
+class BatchOneWesolowskiCallback final : public WesolowskiCallback {
+  public:
+    BatchOneWesolowskiCallback(
+        integer& D,  // Forwarded to the WesolowskiCallback base constructor.
+        const integer& shared_D,
+        const integer& shared_L,
+        int d_bits,  // Passed to SerializeForm when results are written.
+        ChiavdfByteArray* out_arrays,  // Result slots, indexed by BatchJobState::index.
+        size_t job_count,
+        std::atomic<bool>& stopped,  // Shared stop flag; set here on a fatal mismatch or when no job remains active.
+        std::vector<BatchJobState> jobs,
+        uint64_t progress_interval,
+        ChiavdfProgressCallback progress_cb,
+        void* progress_user_data)
+        : WesolowskiCallback(D),
+          shared_D(shared_D),
+          shared_L(shared_L),
+          d_bits(d_bits),
+          out_arrays(out_arrays),
+          job_count(job_count),
+          stopped(stopped),
+          jobs(std::move(jobs)),
+          progress_interval(progress_interval),
+          progress_cb(progress_cb),
+          progress_user_data(progress_user_data),
+          next_progress(progress_interval) {}
+
+	    void initialize(const form& x0) {
+	        for (auto& job : jobs) {
+	            job.buckets.process_checkpoint(/*i=*/0, x0, /*record_stats=*/false);
+
+	            // Set first checkpoint event at t=kl, but only if it corresponds to i < limit.
+	            if (job.limit <= 1) {
+	                job.next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+            }
+        }
+        recompute_next_event();
+    }
+
+    void OnIteration(int type, void* data, uint64_t iteration) override {
+        iteration++;
+        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {  // Progress is reported even when no job event is due.
+            progress_cb(next_progress, progress_user_data);
+            next_progress += progress_interval;
+        }
+        if (iteration != next_event) {  // No job has a checkpoint or completion at this iteration.
+            return;
+        }
+
+        form checkpoint;
+        SetForm(type, data, &checkpoint);
+
+        for (auto& job : jobs) {
+            if (job.done) {
+                continue;
+            }
+
+            if (job.next_checkpoint_t == iteration) {  // Checkpoint and completion are tested separately; both can hit the same iteration.
+                uint64_t i = iteration / job.kl;  // Checkpoint index for this job.
+                if (i < job.limit) {
+                    job.buckets.process_checkpoint(i, checkpoint);
+                }
+                job.next_checkpoint_t += job.kl;
+            }
+
+            if (job.wanted_iter == iteration) {
+                if (!(checkpoint == job.y_ref)) {  // Computed output differs from the expected one: abort the whole batch.
+                    fatal_error = true;
+                    stopped.store(true);
+                    return;
+                }
+                spawn_finalize_job(job);
+            }
+        }
+
+        recompute_next_event();
+    }
+
+    bool ok() const { return !fatal_error; }
+
+    void join_finalizers() {
+        for (auto& t : finalizers) {
+            if (t.joinable()) {
+                t.join();
+            }
+        }
+        finalizers.clear();
+    }
+
+  private:
+    void spawn_finalize_job(BatchJobState& job) {
+        job.done = true;
+        job.next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+
+        size_t idx = job.index;
+        form y_ref = std::move(job.y_ref);
+        StreamingWesolowskiBuckets buckets = std::move(job.buckets);
+
+        finalizers.emplace_back([this, idx, y_ref = std::move(y_ref), buckets = std::move(buckets)]() mutable {
+            try {
+                form proof_form = buckets.finalize_proof();
+                std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
+                std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
+                if (y_serialized.empty() || proof_serialized.empty()) {
+                    out_arrays[idx] = empty_result();
+                    return;
+                }
+
+                const size_t total = y_serialized.size() + proof_serialized.size();
+                uint8_t* out = new uint8_t[total];
+                std::copy(y_serialized.begin(), y_serialized.end(), out);
+                std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
+                out_arrays[idx] = ChiavdfByteArray{out, total};
+            } catch (...) {
+                out_arrays[idx] = empty_result();
+            }
+        });
+    }
+
+    void recompute_next_event() {
+        uint64_t next = std::numeric_limits<uint64_t>::max();
+        bool any_active = false;
+
+        for (const auto& job : jobs) {
+            if (job.done) {
+                continue;
+            }
+            any_active = true;
+            next = std::min(next, job.wanted_iter);
+            next = std::min(next, job.next_checkpoint_t);
+        }
+
+        if (!any_active) {
+            stopped.store(true);
+        }
+        next_event = next;
+    }
+
+    const integer& shared_D;
+    const integer& shared_L;
+    int d_bits;
+    ChiavdfByteArray* out_arrays;  // Finalizer threads write out_arrays[idx] for their own job only.
+    size_t job_count;  // Stored by the constructor; not read by any method shown in this class.
+    std::atomic<bool>& stopped;
+    std::vector<BatchJobState> jobs;
+    std::vector<std::thread> finalizers;
+    uint64_t next_event = std::numeric_limits<uint64_t>::max();
+    uint64_t progress_interval;
+    ChiavdfProgressCallback progress_cb;
+    void* progress_user_data;  // Passed back unchanged as the second argument of progress_cb.
+    uint64_t next_progress;  // Next iteration threshold at which progress_cb fires.
+    bool fatal_error = false;  // Set when a job's computed output does not equal its y_ref.
+};
+
 ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
     const uint8_t* challenge_hash,
     size_t challenge_size,
@@ -702,7 +882,7 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
         k,
         l,
         limit,
-        B,
+        std::move(B),
         use_getblock_opt,
         progress_interval,
         progress_cb,
@@ -973,6 +1153,27 @@ extern "C" bool chiavdf_get_last_streaming_stats(
     return true;
 }
 
+extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count) {  // Thin wrapper: forwards every argument to the _with_progress variant with progress reporting off.
+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        discriminant_size_bits,
+        jobs,
+        job_count,
+        /*progress_interval=*/0,  // An interval of 0 and a null callback each disable progress reporting.
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
 extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
     const uint8_t* challenge_hash,
     size_t challenge_size,
@@ -984,85 +1185,145 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
     uint64_t progress_interval,
     ChiavdfProgressCallback progress_cb,
     void* progress_user_data) {
-    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
-        return nullptr;
-    }
-    if (discriminant_size_bits == 0 || jobs == nullptr || job_count == 0) {
-        return nullptr;
-    }
-
-    ChiavdfByteArray* out_arrays = nullptr;
     try {
-        out_arrays = new ChiavdfByteArray[job_count];
-        for (size_t idx = 0; idx < job_count; ++idx) {
-            out_arrays[idx] = empty_result();
+        std::call_once(init_once, init_chiavdf_fast);
+
+        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
+            jobs == nullptr || job_count == 0 || discriminant_size_bits == 0) {
+            return nullptr;
         }
 
-        uint64_t completed_iters = 0;
-        for (size_t idx = 0; idx < job_count; ++idx) {
-            const ChiavdfBatchJob& job = jobs[idx];
-            if (job.y_ref_s == nullptr || job.y_ref_s_size == 0 || job.num_iterations == 0) {
-                free_byte_array_batch_internal(out_arrays, job_count);
+        for (size_t idx = 0; idx < job_count; idx++) {
+            if (jobs[idx].y_ref_s == nullptr || jobs[idx].y_ref_s_size == 0 ||
+                jobs[idx].num_iterations == 0) {
                 return nullptr;
             }
+        }
+
+        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
+        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
+        integer L = root(-D, 4);
+
+        form x0 = DeserializeForm(D, x_s, x_s_size);
+
+        int d_bits = D.num_bits();
 
-            BatchProgressContext progress_ctx;
-            progress_ctx.completed_before = completed_iters;
-            progress_ctx.progress_cb = progress_cb;
-            progress_ctx.progress_user_data = progress_user_data;
-            const bool use_progress = progress_cb != nullptr && progress_interval != 0;
-
-            out_arrays[idx] = chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-                challenge_hash,
-                challenge_size,
-                x_s,
-                x_s_size,
-                job.y_ref_s,
-                job.y_ref_s_size,
-                discriminant_size_bits,
-                job.num_iterations,
-                progress_interval,
-                use_progress ? batch_progress_trampoline : nullptr,
-                use_progress ? static_cast<void*>(&progress_ctx) : nullptr);
-
-            if (out_arrays[idx].data == nullptr || out_arrays[idx].length == 0) {
-                free_byte_array_batch_internal(out_arrays, job_count);
+        auto* out_arrays = new ChiavdfByteArray[job_count]();
+
+        uint64_t t_max = 0;
+        std::vector<BatchJobState> job_states;
+        job_states.reserve(job_count);
+
+        const uint64_t budget = bucket_memory_budget_bytes.load(std::memory_order_relaxed);
+        const uint64_t per_job_budget = (budget == 0 || job_count == 0)
+                                            ? budget
+                                            : (budget / static_cast<uint64_t>(job_count));
+
+        for (size_t idx = 0; idx < job_count; idx++) {
+            uint64_t num_iterations = jobs[idx].num_iterations;
+            t_max = std::max(t_max, num_iterations);
+
+            form y_ref = DeserializeForm(D, jobs[idx].y_ref_s, jobs[idx].y_ref_s_size);
+
+            uint32_t k;
+            uint32_t l;
+            bool tuned = false;
+            if (num_iterations >= (1 << 16)) {
+                tuned = tune_streaming_parameters(
+                    num_iterations,
+                    discriminant_size_bits,
+                    per_job_budget,
+                    l,
+                    k);
+            }
+            if (!tuned) {
+                if (num_iterations >= (1 << 16)) {
+                    ApproximateParameters(num_iterations, l, k);
+                } else {
+                    k = 10;
+                    l = 1;
+                }
+            }
+            if (k == 0) {
+                k = 1;
+            }
+            if (l == 0) {
+                l = 1;
+            }
+
+            uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
+            uint64_t limit = num_iterations / kl;
+            if (num_iterations % kl) {
+                limit++;
+            }
+
+            integer B = GetB(D, x0, y_ref);
+
+            StreamingWesolowskiBuckets buckets(
+                D,
+                L,
+                num_iterations,
+                k,
+                l,
+                limit,
+                std::move(B),
+                /*use_getblock_opt=*/true);
+
+            if (!buckets.init_ok()) {
+                chiavdf_free_byte_array_batch(out_arrays, job_count);
                 return nullptr;
             }
 
-            completed_iters = saturating_add_u64(completed_iters, job.num_iterations);
+            job_states.emplace_back(
+                idx,
+                num_iterations,
+                k,
+                l,
+                limit,
+                std::move(y_ref),
+                std::move(buckets));
+        }
+
+        std::atomic<bool> stopped(false);
+        BatchOneWesolowskiCallback weso(
+            D,
+            D,
+            L,
+            d_bits,
+            out_arrays,
+            job_count,
+            stopped,
+            std::move(job_states),
+            progress_interval,
+            progress_cb,
+            progress_user_data);
+        weso.initialize(x0);
+
+        FastStorage* fast_storage = nullptr;
+        repeated_square(t_max, x0, D, L, &weso, fast_storage, stopped);
+
+        weso.join_finalizers();
+
+        if (!weso.ok()) {
+            chiavdf_free_byte_array_batch(out_arrays, job_count);
+            return nullptr;
         }
 
         return out_arrays;
     } catch (...) {
-        free_byte_array_batch_internal(out_arrays, job_count);
         return nullptr;
     }
 }
 
-extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
-    const uint8_t* challenge_hash,
-    size_t challenge_size,
-    const uint8_t* x_s,
-    size_t x_s_size,
-    size_t discriminant_size_bits,
-    const ChiavdfBatchJob* jobs,
-    size_t job_count) {
-    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
-        challenge_hash,
-        challenge_size,
-        x_s,
-        x_s_size,
-        discriminant_size_bits,
-        jobs,
-        job_count,
-        /*progress_interval=*/0,
-        /*progress_cb=*/nullptr,
-        /*progress_user_data=*/nullptr);
-}
-
 extern "C" void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count) {
-    free_byte_array_batch_internal(arrays, count);
+    if (arrays == nullptr) {  // NULL is accepted and ignored.
+        return;
+    }
+    for (size_t idx = 0; idx < count; idx++) {  // Free each entry's payload before freeing the array itself.
+        delete[] arrays[idx].data;  // delete[] on a null data pointer is a no-op, so empty entries are fine.
+        arrays[idx] = empty_result();  // Reset the slot so it no longer holds the freed pointer.
+    }
+    delete[] arrays;  // The array must have been allocated with new[] (as the batch prover does).
 }
 
 extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }
diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
index 115c3abd..a83bd746 100644
--- a/src/c_bindings/fast_wrapper.h
+++ b/src/c_bindings/fast_wrapper.h
@@ -55,6 +55,12 @@ bool chiavdf_get_last_streaming_stats(
     uint64_t* out_checkpoint_calls,
     uint64_t* out_bucket_updates);
 
+typedef struct {  // One proof request within a batch call.
+    const uint8_t* y_ref_s;  // Serialized expected output form (y_ref); must be non-NULL.
+    size_t y_ref_s_size;  // Size of y_ref_s; must be non-zero.
+    uint64_t num_iterations;  // Iteration count for this job; must be non-zero.
+} ChiavdfBatchJob;
+
 // Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
 //
 // On success, returns `y || proof` where:
@@ -144,8 +150,20 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progres
     ChiavdfProgressCallback progress_cb,
     void* progress_user_data);
 
-// Batch variant: computes one proof per `jobs[i]` using a shared API surface.
-// Returns an array of `job_count` results on success; caller owns/frees it.
+// Computes multiple compact (witness_type=0) Wesolowski proofs in one shared
+// squaring run ("Trick 2"), using the streaming algorithm (Trick 1) and the
+// GetBlock precomputation optimization.
+//
+// All jobs in the batch must share the same:
+// - `challenge_hash`
+// - `x_s` (input form bytes)
+// - `discriminant_size_bits`
+//
+// Returns an array of `job_count` byte arrays, each containing `y || proof` on
+// success. The caller must free the returned array using
+// `chiavdf_free_byte_array_batch(...)`.
+//
+// On fatal error (including output mismatch), returns NULL.
 ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
     const uint8_t* challenge_hash,
     size_t challenge_size,
@@ -155,7 +173,9 @@ ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
     const ChiavdfBatchJob* jobs,
     size_t job_count);
 
-// Same as batch API above, with optional aggregate progress callback.
+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch`, but
+// optionally invokes `progress_cb` from the proving thread every
+// `progress_interval` squaring iterations completed.
 ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
     const uint8_t* challenge_hash,
     size_t challenge_size,

From 81b7b0dec9b0975ef570c874d61df21c2d3cd8a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Mora?= <ealrann@gmail.com>
Date: Wed, 28 Jan 2026 02:13:48 +0100
Subject: [PATCH 15/21] Event queue

(cherry picked from commit 750df6648d0020cda0c22df55478fe4953b5ecba)
---
 src/c_bindings/fast_wrapper.cpp | 106 +++++++++++++++++++++-----------
 1 file changed, 71 insertions(+), 35 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index d3ffef65..772d6758 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -7,6 +7,7 @@
 #include <cstdio>
 #include <limits>
 #include <mutex>
+#include <queue>
 #include <thread>
 #include <vector>
 
@@ -628,6 +629,7 @@ struct BatchJobState {
     form y_ref;
     StreamingWesolowskiBuckets buckets;
     uint64_t next_checkpoint_t;
+    uint64_t next_event_t;
     bool done = false;
 
     BatchJobState(
@@ -646,7 +648,10 @@ struct BatchJobState {
           limit(limit),
           y_ref(std::move(y_ref)),
           buckets(std::move(buckets)),
-          next_checkpoint_t(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)) {}
+          next_checkpoint_t(
+              (limit <= 1) ? std::numeric_limits<uint64_t>::max()
+                           : (static_cast<uint64_t>(k) * static_cast<uint64_t>(l))),
+          next_event_t(std::min(wanted_iter, next_checkpoint_t)) {}
 };
 
 class BatchOneWesolowskiCallback final : public WesolowskiCallback {
@@ -673,20 +678,17 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
           jobs(std::move(jobs)),
           progress_interval(progress_interval),
           progress_cb(progress_cb),
-          progress_user_data(progress_user_data),
-          next_progress(progress_interval) {}
+	          progress_user_data(progress_user_data),
+	          next_progress(progress_interval) {}
 
 	    void initialize(const form& x0) {
-	        for (auto& job : jobs) {
+	        for (size_t job_pos = 0; job_pos < jobs.size(); job_pos++) {
+	            auto& job = jobs[job_pos];
 	            job.buckets.process_checkpoint(/*i=*/0, x0, /*record_stats=*/false);
-
-	            // Set first checkpoint event at t=kl, but only if it corresponds to i < limit.
-	            if (job.limit <= 1) {
-	                job.next_checkpoint_t = std::numeric_limits<uint64_t>::max();
-            }
-        }
-        recompute_next_event();
-    }
+	            schedule_job(job_pos);
+	        }
+	        refresh_next_event();
+	    }
 
     void OnIteration(int type, void* data, uint64_t iteration) override {
         iteration++;
@@ -701,8 +703,15 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
         form checkpoint;
         SetForm(type, data, &checkpoint);
 
-        for (auto& job : jobs) {
-            if (job.done) {
+        while (!event_queue.empty()) {
+            const JobEvent next = event_queue.top();
+            if (next.t != iteration) {
+                break;
+            }
+            event_queue.pop();
+
+            BatchJobState& job = jobs[next.job_pos];
+            if (job.done || job.next_event_t != iteration) {
                 continue;
             }
 
@@ -711,7 +720,13 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
                 if (i < job.limit) {
                     job.buckets.process_checkpoint(i, checkpoint);
                 }
-                job.next_checkpoint_t += job.kl;
+
+                const uint64_t next_i = i + 1;
+                if (next_i < job.limit) {
+                    job.next_checkpoint_t = next_i * job.kl;
+                } else {
+                    job.next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+                }
             }
 
             if (job.wanted_iter == iteration) {
@@ -722,9 +737,13 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
                 }
                 spawn_finalize_job(job);
             }
+
+            if (!job.done) {
+                schedule_job(next.job_pos);
+            }
         }
 
-        recompute_next_event();
+        refresh_next_event();
     }
 
     bool ok() const { return !fatal_error; }
@@ -739,9 +758,44 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
     }
 
   private:
+    struct JobEvent {  // Event-queue entry: the job at `job_pos` has something due at iteration `t`.
+        uint64_t t;  // Iteration at which the event is due.
+        size_t job_pos;  // Index into the `jobs` vector.
+    };
+
+    struct JobEventGreater {  // Reversed ordering so std::priority_queue pops the smallest `t` first.
+        bool operator()(const JobEvent& a, const JobEvent& b) const noexcept { return a.t > b.t; }
+    };
+
+    void schedule_job(size_t job_pos) {  // Recomputes the job's next event time and pushes it onto the event queue.
+        BatchJobState& job = jobs[job_pos];
+        if (job.done) {  // Finished jobs are never queued.
+            job.next_event_t = std::numeric_limits<uint64_t>::max();
+            return;
+        }
+        job.next_event_t = std::min(job.wanted_iter, job.next_checkpoint_t);  // Whichever comes first: completion or next checkpoint.
+        event_queue.push(JobEvent{job.next_event_t, job_pos});
+    }
+
+    void refresh_next_event() {  // Sets next_event to the earliest still-valid queue entry, or sets the stop flag if none is left.
+        while (!event_queue.empty()) {
+            const JobEvent next = event_queue.top();
+            const BatchJobState& job = jobs[next.job_pos];
+            if (job.done || job.next_event_t != next.t) {  // Stale entry: job finished, or its next event time no longer matches.
+                event_queue.pop();
+                continue;
+            }
+            next_event = next.t;  // Entry is left in the queue; only next_event is updated.
+            return;
+        }
+        next_event = std::numeric_limits<uint64_t>::max();  // Queue drained: no further events are pending.
+        stopped.store(true);
+    }
+
     void spawn_finalize_job(BatchJobState& job) {
         job.done = true;
         job.next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+        job.next_event_t = std::numeric_limits<uint64_t>::max();
 
         size_t idx = job.index;
         form y_ref = std::move(job.y_ref);
@@ -768,25 +822,6 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
         });
     }
 
-    void recompute_next_event() {
-        uint64_t next = std::numeric_limits<uint64_t>::max();
-        bool any_active = false;
-
-        for (const auto& job : jobs) {
-            if (job.done) {
-                continue;
-            }
-            any_active = true;
-            next = std::min(next, job.wanted_iter);
-            next = std::min(next, job.next_checkpoint_t);
-        }
-
-        if (!any_active) {
-            stopped.store(true);
-        }
-        next_event = next;
-    }
-
     const integer& shared_D;
     const integer& shared_L;
     int d_bits;
@@ -795,6 +830,7 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
     std::atomic<bool>& stopped;
     std::vector<BatchJobState> jobs;
     std::vector<std::thread> finalizers;
+    std::priority_queue<JobEvent, std::vector<JobEvent>, JobEventGreater> event_queue;
     uint64_t next_event = std::numeric_limits<uint64_t>::max();
     uint64_t progress_interval;
     ChiavdfProgressCallback progress_cb;

From ea0a0ab6ec581308ec709849cc1e7d171140153f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Mora?= <ealrann@gmail.com>
Date: Fri, 13 Feb 2026 03:28:58 +0100
Subject: [PATCH 16/21] Fix unbounded RSS growth

(cherry picked from commit 445cb0d54fbd43e479e9479c2605bbd3d0c2ddd5)
---
 src/c_bindings/fast_wrapper.cpp | 202 +++++++++++++++++++++++++++-----
 1 file changed, 171 insertions(+), 31 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 772d6758..a3b8358d 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -6,6 +6,7 @@
 #include <chrono>
 #include <cstdio>
 #include <limits>
+#include <memory>
 #include <mutex>
 #include <queue>
 #include <thread>
@@ -654,6 +655,163 @@ struct BatchJobState {
           next_event_t(std::min(wanted_iter, next_checkpoint_t)) {}
 };
 
+struct BatchFinalizeLatch {  // Counts outstanding finalize tasks so a caller can block until all have completed.
+    void add_task() {  // Registers one more pending task.
+        std::lock_guard<std::mutex> lk(mutex);
+        remaining++;
+    }
+
+    void task_done() {  // Marks one task finished and wakes waiters when none remain.
+        std::lock_guard<std::mutex> lk(mutex);
+        if (remaining > 0) {  // Guards against underflow if called more often than add_task().
+            remaining--;
+        }
+        if (remaining == 0) {
+            cv.notify_all();
+        }
+    }
+
+    void wait() {  // Blocks until remaining == 0; returns immediately if nothing is pending.
+        std::unique_lock<std::mutex> lk(mutex);
+        cv.wait(lk, [this]() { return remaining == 0; });
+    }
+
+  private:
+    std::mutex mutex;  // Protects `remaining`.
+    std::condition_variable cv;
+    size_t remaining = 0;  // Tasks added but not yet reported done.
+};
+
+struct BatchFinalizeTask {  // Everything a pool worker needs to finalize one job and publish its result.
+    size_t idx;  // Slot in out_arrays that the worker writes.
+    int d_bits;  // Passed to SerializeForm for both y_ref and the proof.
+    ChiavdfByteArray* out_arrays;  // Result array; the worker assigns out_arrays[idx].
+    form y_ref;
+    StreamingWesolowskiBuckets buckets;  // Source of the proof via finalize_proof_with_reducer().
+    std::shared_ptr<BatchFinalizeLatch> latch;  // task_done() is called on it when the worker finishes this task.
+
+    BatchFinalizeTask(
+        size_t idx,
+        int d_bits,
+        ChiavdfByteArray* out_arrays,
+        form y_ref,
+        StreamingWesolowskiBuckets buckets,
+        std::shared_ptr<BatchFinalizeLatch> latch)
+        : idx(idx),
+          d_bits(d_bits),
+          out_arrays(out_arrays),
+          y_ref(std::move(y_ref)),
+          buckets(std::move(buckets)),
+          latch(std::move(latch)) {}
+};
+
+class GlobalBatchFinalizerPool final {  // Singleton pool of worker threads that run BatchFinalizeTask jobs.
+  public:
+    static GlobalBatchFinalizerPool& instance() {
+        static GlobalBatchFinalizerPool pool;  // Function-local static: constructed (and workers started) on first use.
+        return pool;
+    }
+
+    void enqueue(BatchFinalizeTask task) {  // Queues a task and wakes one worker.
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            queue.push(std::move(task));
+        }
+        cv.notify_one();  // Notified after the lock is released.
+    }
+
+  private:
+    GlobalBatchFinalizerPool() { start_workers(); }
+
+    ~GlobalBatchFinalizerPool() {  // Flags shutdown, wakes every worker, then joins them.
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            shutdown = true;
+        }
+        cv.notify_all();
+        for (auto& t : workers) {
+            if (t.joinable()) {
+                t.join();
+            }
+        }
+    }
+
+    GlobalBatchFinalizerPool(const GlobalBatchFinalizerPool&) = delete;
+    GlobalBatchFinalizerPool& operator=(const GlobalBatchFinalizerPool&) = delete;
+
+    void start_workers() {  // Spawns between 1 and 8 workers, based on a quarter of the reported hardware threads.
+        size_t count = std::thread::hardware_concurrency();
+        if (count == 0) {  // hardware_concurrency() may return 0 when the value is not computable.
+            count = 1;
+        }
+        // Keep this intentionally small: the caller already runs many compute
+        // threads (Rust `-p` workers), and each extra C++ worker carries large
+        // thread-local GMP scratch state (NUCOMP/NUDUPL).
+        count = std::max<size_t>(1, count / 4);
+        count = std::min<size_t>(count, 8);
+
+        workers.reserve(count);
+        for (size_t i = 0; i < count; i++) {
+            workers.emplace_back([this]() { worker_loop(); });
+        }
+    }
+
+    std::optional<BatchFinalizeTask> take_task() {  // Blocks for the next task; nullopt tells the worker to exit.
+        std::unique_lock<std::mutex> lk(mutex);
+        cv.wait(lk, [this]() { return shutdown || !queue.empty(); });
+        if (queue.empty()) {  // Only reachable when shutdown is set; already-queued tasks are still handed out first.
+            return std::nullopt;
+        }
+        BatchFinalizeTask task = std::move(queue.front());
+        queue.pop();
+        return std::make_optional<BatchFinalizeTask>(std::move(task));
+    }
+
+    void worker_loop() {  // Thread body: take a task, build and serialize its proof, publish the bytes, signal the latch.
+        PulmarkReducer reducer;  // One reducer per worker thread, reused for every task it runs.
+        while (true) {
+            auto task_opt = take_task();
+            if (!task_opt.has_value()) {
+                return;
+            }
+            BatchFinalizeTask task = std::move(*task_opt);
+
+            struct LatchGuard {  // Signals the latch when this iteration ends, whether by success, `continue`, or a caught exception.
+                std::shared_ptr<BatchFinalizeLatch> latch;
+                ~LatchGuard() {
+                    if (latch) {
+                        latch->task_done();
+                    }
+                }
+            } guard{task.latch};
+
+            try {
+                form proof_form = task.buckets.finalize_proof_with_reducer(reducer);
+                std::vector<unsigned char> y_serialized = SerializeForm(task.y_ref, task.d_bits);
+                std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, task.d_bits);
+                if (y_serialized.empty() || proof_serialized.empty()) {  // An empty serialization is published as an empty result.
+                    task.out_arrays[task.idx] = empty_result();
+                    continue;
+                }
+
+                const size_t total = y_serialized.size() + proof_serialized.size();
+                uint8_t* out = new uint8_t[total];  // Stored in out_arrays below; not freed by the worker.
+                std::copy(y_serialized.begin(), y_serialized.end(), out);
+                std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());  // Layout is y followed by proof.
+                task.out_arrays[task.idx] = ChiavdfByteArray{out, total};
+            } catch (...) {  // Any exception turns this slot into an empty result instead of escaping the thread.
+                task.out_arrays[task.idx] = empty_result();
+            }
+        }
+    }
+
+    std::vector<std::thread> workers;
+    std::mutex mutex;  // Protects `queue` and `shutdown`.
+    std::condition_variable cv;
+    std::queue<BatchFinalizeTask> queue;
+    bool shutdown = false;  // Set once by the destructor.
+};
+
 class BatchOneWesolowskiCallback final : public WesolowskiCallback {
   public:
     BatchOneWesolowskiCallback(
@@ -679,7 +837,9 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
           progress_interval(progress_interval),
           progress_cb(progress_cb),
 	          progress_user_data(progress_user_data),
-	          next_progress(progress_interval) {}
+	          next_progress(progress_interval),
+          finalizer_latch(std::make_shared<BatchFinalizeLatch>()) {
+    }
 
 	    void initialize(const form& x0) {
 	        for (size_t job_pos = 0; job_pos < jobs.size(); job_pos++) {
@@ -749,12 +909,7 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
     bool ok() const { return !fatal_error; }
 
     void join_finalizers() {
-        for (auto& t : finalizers) {
-            if (t.joinable()) {
-                t.join();
-            }
-        }
-        finalizers.clear();
+        finalizer_latch->wait();  // Presumably blocks until every add_task() has a matching task_done() - confirm in BatchFinalizeLatch.
     }
 
   private:
@@ -797,29 +952,14 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
         job.next_checkpoint_t = std::numeric_limits<uint64_t>::max();
         job.next_event_t = std::numeric_limits<uint64_t>::max();
 
-        size_t idx = job.index;
-        form y_ref = std::move(job.y_ref);
-        StreamingWesolowskiBuckets buckets = std::move(job.buckets);
-
-        finalizers.emplace_back([this, idx, y_ref = std::move(y_ref), buckets = std::move(buckets)]() mutable {
-            try {
-                form proof_form = buckets.finalize_proof();
-                std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
-                std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
-                if (y_serialized.empty() || proof_serialized.empty()) {
-                    out_arrays[idx] = empty_result();
-                    return;
-                }
-
-                const size_t total = y_serialized.size() + proof_serialized.size();
-                uint8_t* out = new uint8_t[total];
-                std::copy(y_serialized.begin(), y_serialized.end(), out);
-                std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
-                out_arrays[idx] = ChiavdfByteArray{out, total};
-            } catch (...) {
-                out_arrays[idx] = empty_result();
-            }
-        });
+        finalizer_latch->add_task();
+        GlobalBatchFinalizerPool::instance().enqueue(BatchFinalizeTask(
+            job.index,
+            d_bits,
+            out_arrays,
+            std::move(job.y_ref),
+            std::move(job.buckets),
+            finalizer_latch));
     }
 
     const integer& shared_D;
@@ -829,7 +969,7 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
     size_t job_count;
     std::atomic<bool>& stopped;
     std::vector<BatchJobState> jobs;
-    std::vector<std::thread> finalizers;
+    std::shared_ptr<BatchFinalizeLatch> finalizer_latch;
     std::priority_queue<JobEvent, std::vector<JobEvent>, JobEventGreater> event_queue;
     uint64_t next_event = std::numeric_limits<uint64_t>::max();
     uint64_t progress_interval;

From 813e5b8b6ac7f33961ea8737ccc6fd3f674b8b1c Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 21:28:29 -0700
Subject: [PATCH 17/21] Fix batch fast-wrapper exception cleanup

Ensure batch proving joins pending finalizer work and frees allocated output arrays on exceptions so stack-referenced state cannot outlive the call frame. Also remove an unused internal batch-free helper that duplicated the public C API.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 35 ++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index a3b8358d..90bab448 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -84,18 +84,6 @@ bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) {
     return true;
 }
 
-void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) {
-    if (arrays == nullptr) {
-        return;
-    }
-    for (size_t idx = 0; idx < count; ++idx) {
-        delete[] arrays[idx].data;
-        arrays[idx].data = nullptr;
-        arrays[idx].length = 0;
-    }
-    delete[] arrays;
-}
-
 struct BatchProgressContext {
     uint64_t completed_before = 0;
     ChiavdfProgressCallback progress_cb = nullptr;
@@ -1361,6 +1349,9 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
     uint64_t progress_interval,
     ChiavdfProgressCallback progress_cb,
     void* progress_user_data) {
+    ChiavdfByteArray* out_arrays = nullptr;
+    std::unique_ptr<BatchOneWesolowskiCallback> weso;
+    bool finalizers_joined = false;
     try {
         std::call_once(init_once, init_chiavdf_fast);
 
@@ -1384,7 +1375,7 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
 
         int d_bits = D.num_bits();
 
-        auto* out_arrays = new ChiavdfByteArray[job_count]();
+        out_arrays = new ChiavdfByteArray[job_count]();
 
         uint64_t t_max = 0;
         std::vector<BatchJobState> job_states;
@@ -1461,7 +1452,7 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
         }
 
         std::atomic<bool> stopped(false);
-        BatchOneWesolowskiCallback weso(
+        weso = std::make_unique<BatchOneWesolowskiCallback>(
             D,
             D,
             L,
@@ -1473,20 +1464,28 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
             progress_interval,
             progress_cb,
             progress_user_data);
-        weso.initialize(x0);
+        weso->initialize(x0);
 
         FastStorage* fast_storage = nullptr;
-        repeated_square(t_max, x0, D, L, &weso, fast_storage, stopped);
+        repeated_square(t_max, x0, D, L, weso.get(), fast_storage, stopped);
 
-        weso.join_finalizers();
+        weso->join_finalizers();
+        finalizers_joined = true;
 
-        if (!weso.ok()) {
+        if (!weso->ok()) {
             chiavdf_free_byte_array_batch(out_arrays, job_count);
             return nullptr;
         }
 
         return out_arrays;
     } catch (...) {
+        if (weso != nullptr && !finalizers_joined) {
+            try {
+                weso->join_finalizers();
+            } catch (...) {
+            }
+        }
+        chiavdf_free_byte_array_batch(out_arrays, job_count);
         return nullptr;
     }
 }

From b15f01bc772d612061764aebe7de52fa87e36736 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 21:45:18 -0700
Subject: [PATCH 18/21] Fail closed on streaming replay corruption

Handle replay notifications in streaming Wesolowski callbacks by rejecting replayed batches instead of reusing irreversibly accumulated bucket state. This prevents silently incorrect proofs when the fast squaring path replays a corrupted batch.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 90bab448..ec322d7c 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -545,7 +545,16 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
     bool init_ok() const { return buckets.init_ok(); }
 
+    void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {  // Replay notification: mark this callback as failed.
+        (void)base_iteration;  // Unused; the cast silences unused-parameter warnings.
+        (void)batch_size;  // Unused as well.
+        replayed_after_corruption = true;  // While set, OnIteration() returns immediately and ok() reports false.
+    }
+
     void OnIteration(int type, void* data, uint64_t iteration) override {
+        if (replayed_after_corruption) {
+            return;
+        }
         iteration++;
         if (iteration > buckets.wanted_iterations()) {
             return;
@@ -587,7 +596,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 	        buckets.process_checkpoint(i, checkpoint, record_stats);
 	    }
 
-    bool ok() const { return has_result; }
+    bool ok() const { return has_result && !replayed_after_corruption; }
 
 	    const form& y() const { return result; }
 
@@ -606,6 +615,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
     form result;
     bool has_result = false;
+    bool replayed_after_corruption = false;
 };
 
 struct BatchJobState {
@@ -838,7 +848,18 @@ class BatchOneWesolowskiCallback final : public WesolowskiCallback {
 	        refresh_next_event();
 	    }
 
+    void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {  // Replay notification: abort the whole batch.
+        (void)base_iteration;  // Unused; the cast silences unused-parameter warnings.
+        (void)batch_size;  // Unused as well.
+        // Streaming bucket updates are irreversible, so fail closed if replay is needed.
+        fatal_error = true;  // While set, OnIteration() returns immediately and ok() reports false.
+        stopped.store(true);  // Raises the stop flag held by reference; presumably ends the squaring loop early - confirm in repeated_square().
+    }
+
     void OnIteration(int type, void* data, uint64_t iteration) override {
+        if (fatal_error) {
+            return;
+        }
         iteration++;
         if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
             progress_cb(next_progress, progress_user_data);

From e508cceb3bfdf3234b7c2efeec020ef1b7e0e2bd Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 22:00:01 -0700
Subject: [PATCH 19/21] Fix batch callback lifetimes across exception cleanup

Keep discriminant and stop-flag state alive for catch-path finalizer joins so callback references never dangle during exception unwinding. This preserves safe cleanup when batch proving fails mid-flight.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index ec322d7c..98364119 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -1373,6 +1373,9 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
     ChiavdfByteArray* out_arrays = nullptr;
     std::unique_ptr<BatchOneWesolowskiCallback> weso;
     bool finalizers_joined = false;
+    integer D;
+    integer L;
+    std::atomic<bool> stopped(false);
     try {
         std::call_once(init_once, init_chiavdf_fast);
 
@@ -1389,8 +1392,8 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
         }
 
         std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
-        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
-        integer L = root(-D, 4);
+        D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
+        L = root(-D, 4);
 
         form x0 = DeserializeForm(D, x_s, x_s_size);
 
@@ -1472,7 +1475,6 @@ extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_
                 std::move(buckets));
         }
 
-        std::atomic<bool> stopped(false);
         weso = std::make_unique<BatchOneWesolowskiCallback>(
             D,
             D,

From 88602bafd6e58f1f1a82eef7cfc4663dec6593ed Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 22:17:15 -0700
Subject: [PATCH 20/21] Restore batch finalizer reducer API

Add a reducer-aware streaming proof finalization method used by batch worker threads and remove stale unused bucket replay members. This keeps batch finalization functional while cleaning up dead scaffolding flagged by review.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 98364119..a24e217f 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -344,12 +344,16 @@ class StreamingWesolowskiBuckets {
 	    bool init_ok() const { return getblock_ok; }
 
 	    form finalize_proof() const {
+        PulmarkReducer reducer;  // Local reducer for callers that do not supply their own.
+        return finalize_proof_with_reducer(reducer);
+    }
+
+    form finalize_proof_with_reducer(PulmarkReducer& reducer) const {
 	        auto started_at = std::chrono::steady_clock::time_point{};
 	        if (stats_enabled) {
 	            started_at = std::chrono::steady_clock::now();
 	        }
 
-	        PulmarkReducer reducer;
 	        form id = form::identity(D);
 
         uint64_t k1 = k / 2;
@@ -456,9 +460,6 @@ class StreamingWesolowskiBuckets {
     integer getblock_inv_2k;
     integer getblock_r;
     integer getblock_tmp;
-    uint64_t batch_start_iteration = 1;
-    uint64_t batch_end_iteration = 0;
-    std::vector<BatchCheckpoint> current_batch_checkpoints;
 
 	    bool stats_enabled;
 	    uint64_t checkpoint_total_ns = 0;

From 8e76d331dd95264b5ab3b43b5fbf21c81cbebf32 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 22:28:50 -0700
Subject: [PATCH 21/21] Remove unused batch progress dead code

Drop the unused batch progress trampoline scaffolding in fast_wrapper to reduce maintenance noise and keep the batch callback flow aligned with current direct progress handling.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index a24e217f..9184311c 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -69,13 +69,6 @@ void init_chiavdf_fast() {
 
 ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
 
-uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) {
-    if (lhs > std::numeric_limits<uint64_t>::max() - rhs) {
-        return std::numeric_limits<uint64_t>::max();
-    }
-    return lhs + rhs;
-}
-
 bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) {
     if (shift >= 64) {
         return false;
@@ -84,20 +77,6 @@ bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) {
     return true;
 }
 
-struct BatchProgressContext {
-    uint64_t completed_before = 0;
-    ChiavdfProgressCallback progress_cb = nullptr;
-    void* progress_user_data = nullptr;
-};
-
-void batch_progress_trampoline(uint64_t iters_done, void* user_data) {
-    auto* ctx = static_cast<BatchProgressContext*>(user_data);
-    if (ctx == nullptr || ctx->progress_cb == nullptr) {
-        return;
-    }
-    ctx->progress_cb(saturating_add_u64(ctx->completed_before, iters_done), ctx->progress_user_data);
-}
-
 uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
     // Be conservative: class group forms contain 3 GMP-backed integers that
     // quickly grow to the discriminant size (or beyond) during NUCOMP.