From 7847c67b743c1f1f6c9cf218c57e8163f61816be Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Mon, 23 Feb 2026 23:30:52 -0800
Subject: [PATCH 01/13] Add streaming one-wesolowski compaction APIs.

Introduce a fast C wrapper with streaming proof generation, incremental GetBlock optimization, and memory-budgeted (k,l) tuning, plus the minimal runtime/build infrastructure needed to embed chiavdf in multi-worker clients.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/bluebox_compaction.md      |  49 ++
 src/Makefile.vdf-client         |  39 +-
 src/c_bindings/fast_wrapper.cpp | 795 ++++++++++++++++++++++++++++++++
 src/c_bindings/fast_wrapper.h   | 145 ++++++
 src/threading.h                 |   4 +-
 src/vdf.h                       |  29 +-
 6 files changed, 1044 insertions(+), 17 deletions(-)
 create mode 100644 docs/bluebox_compaction.md
 create mode 100644 src/c_bindings/fast_wrapper.cpp
 create mode 100644 src/c_bindings/fast_wrapper.h

diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md
new file mode 100644
index 00000000..61cd1fd4
--- /dev/null
+++ b/docs/bluebox_compaction.md
@@ -0,0 +1,49 @@
+# Bluebox Compaction Optimizations
+
+This document describes the compaction-oriented proving path exposed by
+`src/c_bindings/fast_wrapper.h` and implemented in
+`src/c_bindings/fast_wrapper.cpp`.
+
+## Scope
+
+These APIs are intended for workloads where the expected VDF output (`y_ref`) is
+already known up front (for example, bluebox compaction jobs). They are additive
+and do not change the existing `c_wrapper` APIs.
+
+## Optimization 1: Streaming one-wesolowski
+
+Given `y_ref`, the prover computes:
+
+- `B = GetB(D, x, y_ref)` before squaring starts
+
+This enables a streaming algorithm that updates proof buckets at each
+checkpoint during repeated squaring, instead of materializing the full
+intermediate checkpoint array and scanning it after the loop. In practice this
+substantially reduces memory usage for compaction workloads.
+
+## Optimization 2: Incremental GetBlock mapping
+
+For streaming checkpoint updates, bucket index selection repeatedly calls
+`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and
+advances sequential `p` values incrementally, avoiding full modular
+exponentiation per call and avoiding a large lookup table.
+
+## Optimization 3: Memory-budgeted (k, l) tuning
+
+The wrapper can tune `(k, l)` under a configured memory budget:
+
+- `chiavdf_set_bucket_memory_budget_bytes(...)`
+
+Tuning is attempted only for runs of at least 2^16 iterations. If the budget is
+`0` or no candidate fits it, the code falls back to the standard heuristics.
+
+## Operational Notes
+
+- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to
+  avoid unsolicited stdout noise when embedded in multi-worker clients.
+- Thread-slot assignment for the fast VDF counters is per-thread via
+  `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations
+  run in one process.
+- The production default for `enable_threads` in `parameters.h` is unchanged from
+  upstream to preserve timelord expectations.
+
diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
index 59fcbb63..0fe2380a 100644
--- a/src/Makefile.vdf-client
+++ b/src/Makefile.vdf-client
@@ -26,15 +26,26 @@ ifeq ($(UNAME),Darwin)
 NOPIE =
 endif
 
-CFLAGS += $(LTO_FLAGS) $(NOPIE)
-LDFLAGS += $(LTO_FLAGS) $(NOPIE) -g
+# Optional: set `PIC=1` to build position-independent objects.
+PIC ?= 0
+ifeq ($(PIC),1)
+PICFLAGS = -fPIC
+PIEFLAGS =
+else
+PICFLAGS =
+PIEFLAGS = $(NOPIE)
+endif
+
+CFLAGS += $(LTO_FLAGS) $(PIEFLAGS) $(PICFLAGS)
+LDFLAGS += $(LTO_FLAGS) $(PIEFLAGS) -g
 ifeq ($(OS),Windows_NT)
 LDLIBS += -lmpirxx -lmpir -lws2_32
-CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(NOPIE) -fvisibility=hidden
+CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
 else
 LDLIBS += -lgmpxx -lgmp -pthread
-CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
+CXXFLAGS += $(LTO_FLAGS) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
 endif
+ASFLAGS += $(PICFLAGS)
 ifeq ($(UNAME),Darwin)
 CXXFLAGS += -D CHIAOSX=1
 # Homebrew (common on macOS) installs boost/gmp to /opt/homebrew or /usr/local
@@ -81,7 +92,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
 all: $(BINS)
 
 clean:
-	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
+	rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a
 
 $(BINS) avx512_test: %: %.o lzcnt.o $(ASM_OBJS)
 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
@@ -91,6 +102,9 @@ $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
 lzcnt.o: refcode/lzcnt.c
 	$(CC) $(CFLAGS) -c refcode/lzcnt.c
 
+%.o: %.s
+	$(CC) -c $< -o $@ $(ASFLAGS)
+
 asm_compiled.s: compile_asm
 	./compile_asm
 
@@ -104,6 +118,21 @@ compile_asm: compile_asm.o
 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
 
 HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base_hw.o vdf_hw_symbol_anchors.o prover_runtime.o lzcnt.o
+# ---------------------------------------------------------------------------
+# Static library: fast one-wesolowski proof (BBR integration)
+# ---------------------------------------------------------------------------
+
+FASTLIB = libchiavdf_fastc.a
+FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o $(ASM_OBJS)
+
+.PHONY: fastlib
+
+fastlib: $(FASTLIB)
+
+$(FASTLIB): $(FASTLIB_OBJS)
+	$(AR) rcs $@ $^
+
+c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS)
 EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o
 ifeq ($(OS),Windows_NT)
 HW_LIB = hw/libft4222/libft4222.lib
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
new file mode 100644
index 00000000..198d0a87
--- /dev/null
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -0,0 +1,795 @@
+#include "fast_wrapper.h"
+
+#include <atomic>
+#include <chrono>
+#include <limits>
+#include <mutex>
+#include <vector>
+
+#include "../vdf.h"
+#include "../create_discriminant.h"
+
+// Runtime configuration knobs required by `parameters.h`.
+// These are `extern` variables there, but each binary defines them explicitly.
+bool use_divide_table = false;
+int gcd_base_bits = 50;     // reassigned by init_chiavdf_fast(): 63 with AVX2, else 50
+int gcd_128_max_iter = 3;   // reassigned by init_chiavdf_fast(): 2 with AVX2, else 3
+std::string asmprefix = "cel_";
+bool enable_all_instructions = false;
+
+namespace {
+std::once_flag init_once;  // guards the one-time init_chiavdf_fast() call
+std::atomic<uint64_t> bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL);  // default: 128 MiB
+std::atomic<bool> streaming_stats_enabled(false);  // toggled by chiavdf_set_enable_streaming_stats()
+
+struct LastStreamingParameters {  // (k, l) chosen by the latest streaming run
+    uint32_t k = 0;
+    uint32_t l = 0;
+    bool tuned = false;  // true when (k, l) came from tune_streaming_parameters()
+    bool set = false;    // true once a streaming run has recorded its parameters
+};
+
+thread_local LastStreamingParameters last_streaming_parameters;  // one record per calling thread
+
+struct LastStreamingStats {  // counters copied out of a streaming run after its proof is finalized
+    uint64_t checkpoint_total_ns = 0;        // time spent inside process_checkpoint()
+    uint64_t checkpoint_event_total_ns = 0;  // time per checkpoint event, including SetForm()
+    uint64_t finalize_total_ns = 0;          // time spent inside finalize_proof()
+    uint64_t checkpoint_calls = 0;           // process_checkpoint() calls that recorded stats
+    uint64_t bucket_updates = 0;             // bucket multiplications counted in those calls
+    bool set = false;                        // mirrors whether stats collection was enabled
+};
+
+thread_local LastStreamingStats last_streaming_stats;  // one record per calling thread
+
+void init_chiavdf_fast() {  // process setup; the entry points run it via std::call_once(init_once, ...)
+    init_gmp();
+    set_rounding_mode();
+
+    // Match the vdf_client runtime selection for AVX2.
+    if (hasAVX2()) {
+        gcd_base_bits = 63;
+        gcd_128_max_iter = 2;
+    } else {
+        gcd_base_bits = 50;
+        gcd_128_max_iter = 3;
+    }
+
+    // Ensure we run the one-wesolowski path by default.
+    fast_algorithm = false;
+    two_weso = false;
+    quiet_mode = true;  // avoid stdout noise when embedded (see docs/bluebox_compaction.md)
+}
+
+ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }  // failure value: null data, zero length
+
+uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
+    // Be conservative: class group forms contain 3 GMP-backed integers that
+    // quickly grow to the discriminant size (or beyond) during NUCOMP.
+    //
+    // This estimate is intentionally larger than the raw serialized size to
+    // avoid picking parameters that risk paging/OOM.
+    uint64_t discr_bytes = (static_cast<uint64_t>(discriminant_size_bits) + 7) / 8;  // bits rounded up to bytes
+    uint64_t estimate = discr_bytes * 16;  // 16x the discriminant's byte length
+    if (estimate < 2048) {  // never estimate below 2048 bytes per form
+        estimate = 2048;
+    }
+    return estimate;
+}
+
+// Picks the cheapest `(k, l)` whose bucket table fits in `memory_budget_bytes`.
+// Returns false (outputs untouched) if the budget is 0 or no candidate fits.
+bool tune_streaming_parameters(
+    uint64_t num_iterations,
+    size_t discriminant_size_bits,
+    uint64_t memory_budget_bytes,
+    uint32_t& out_l,
+    uint32_t& out_k) {
+    if (memory_budget_bytes == 0) {
+        return false;
+    }
+
+    // Keep 20% headroom for GMP scratch and process overhead. Split the multiply
+    // so huge budgets cannot wrap uint64_t; still exactly floor(budget * 80 / 100).
+    const uint64_t budget =
+        (memory_budget_bytes / 100) * 80 + ((memory_budget_bytes % 100) * 80) / 100;
+    const uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits);
+    if (budget < bytes_per_form) {
+        return false;
+    }
+
+    const unsigned __int128 total_iters = num_iterations;  // widened: ceil-divs cannot wrap
+    unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
+    bool found = false;
+
+    // Empirical tuning notes (1024-bit discriminants, AVX2 build):
+    // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
+    // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs.
+    //
+    // So checkpoint counts should be weighted much lower than updates/fold.
+    constexpr unsigned __int128 update_weight = 16;
+    constexpr unsigned __int128 fold_weight = 16;
+    constexpr unsigned __int128 checkpoint_weight = 1;
+
+    // Search a small grid of `(k,l)` values. Higher `k` reduces bucket-update
+    // work (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k).
+    for (uint32_t k = 4; k <= 20; k++) {
+        const unsigned __int128 buckets_per_row = static_cast<unsigned __int128>(1) << k;
+
+        for (uint32_t l = 1; l <= 64; l++) {
+            // Skip candidates whose `l * 2^k` bucket forms exceed the budget.
+            const unsigned __int128 form_count = buckets_per_row * l;
+            if (form_count * bytes_per_form > budget) {
+                continue;
+            }
+            const unsigned __int128 kl = static_cast<unsigned __int128>(k) * l;
+            const unsigned __int128 updates = (total_iters + k - 1) / k;        // ceil(T / k)
+            const unsigned __int128 checkpoints = (total_iters + kl - 1) / kl;  // ceil(T / kl)
+            const unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
+            const unsigned __int128 cost =
+                updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;
+            if (!found || cost < best_cost) {
+                found = true;
+                best_cost = cost;
+                out_k = k;
+                out_l = l;
+            }
+        }
+    }
+
+    return found;
+}
+
+uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) {  // slow-path bucket index for block `i`
+    integer res = FastPow(2, T - k * (i + 1), B);  // unsigned exponent: callers must keep k * (i + 1) <= T
+    mpz_mul_2exp(res.impl, res.impl, k);           // res *= 2^k
+    res = res / B;
+    auto res_vector = res.to_vector();
+    return res_vector.empty() ? 0 : res_vector[0];  // first element, or 0 when the vector is empty
+}
+
+class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {  // base callback plus progress reports
+  public:
+    ProgressOneWesolowskiCallback(
+        integer& D,
+        form& f,
+        uint64_t wanted_iter,
+        uint64_t progress_interval,
+        ChiavdfProgressCallback progress_cb,
+        void* progress_user_data)
+        : OneWesolowskiCallback(D, f, wanted_iter),
+          progress_interval(progress_interval),
+          progress_cb(progress_cb),
+          progress_user_data(progress_user_data),
+          next_progress(progress_interval) {}  // first report is due at `progress_interval`
+
+    void OnIteration(int type, void* data, uint64_t iteration) override {
+        OneWesolowskiCallback::OnIteration(type, data, iteration);  // base class bookkeeping runs first
+
+        if (progress_cb == nullptr || progress_interval == 0) {  // progress reporting disabled
+            return;
+        }
+
+        uint64_t done = iteration + 1;  // shift to a 1-based count
+        if (done > wanted_iter) {       // nothing is reported past the requested iteration count
+            return;
+        }
+
+        if (done >= next_progress) {
+            progress_cb(next_progress, progress_user_data);  // reports the threshold reached, not `done`
+            next_progress += progress_interval;
+        }
+    }
+
+  private:
+    uint64_t progress_interval;
+    ChiavdfProgressCallback progress_cb;
+    void* progress_user_data;  // passed back to `progress_cb` unchanged
+    uint64_t next_progress;    // next threshold at which `progress_cb` fires
+};
+
+class StreamingOneWesolowskiCallback final : public WesolowskiCallback {  // folds checkpoints into proof buckets as they arrive
+  public:
+    StreamingOneWesolowskiCallback(
+        integer& D,
+        uint64_t wanted_iter,
+        uint32_t k,
+        uint32_t l,
+        uint64_t limit,
+        integer& B,
+        bool use_getblock_opt,
+        uint64_t progress_interval,
+        ChiavdfProgressCallback progress_cb,
+        void* progress_user_data)
+        : WesolowskiCallback(D),
+          wanted_iter(wanted_iter),
+          k(k),
+          l(l),
+          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
+          limit(limit),
+          B(B),  // copied: the member is an `integer`, not a reference
+          progress_interval(progress_interval),
+          progress_cb(progress_cb),
+          progress_user_data(progress_user_data),
+          next_progress(progress_interval),
+          use_getblock_opt(use_getblock_opt),
+          stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {  // sampled once per run
+        form id = form::identity(D);
+        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);  // l rows of 2^k buckets, all set to `id`
+
+        if (use_getblock_opt) {
+            getblock_ok = init_getblock_opt_state();  // a failure here is reported through init_ok()
+        }
+    }
+
+    void OnIteration(int type, void* data, uint64_t iteration) override {
+        iteration++;  // shift to a 1-based count
+        if (iteration > wanted_iter) {
+            return;
+        }
+
+        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
+            progress_cb(next_progress, progress_user_data);  // reports the threshold reached
+            next_progress += progress_interval;
+        }
+
+        if (iteration % kl == 0) {  // a checkpoint falls on every multiple of k * l
+            uint64_t pos = iteration / kl;
+            if (pos < limit) {
+                form checkpoint;
+                auto started_at = std::chrono::steady_clock::time_point{};
+                if (stats_enabled) {
+                    started_at = std::chrono::steady_clock::now();
+                }
+                SetForm(type, data, &checkpoint);
+                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
+                if (stats_enabled) {  // event time covers SetForm() plus process_checkpoint()
+                    checkpoint_event_total_ns += static_cast<uint64_t>(
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            std::chrono::steady_clock::now() - started_at)
+                            .count());
+                }
+            }
+        }
+
+        if (iteration == wanted_iter) {  // capture the final output form
+            SetForm(type, data, &result);
+            has_result = true;
+        }
+    }
+
+    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {  // multiply checkpoint `i` into one bucket per row
+        const bool do_stats = stats_enabled && record_stats;
+        auto started_at = std::chrono::steady_clock::time_point{};
+        if (do_stats) {
+            started_at = std::chrono::steady_clock::now();
+        }
+
+        uint64_t local_updates = 0;
+        for (uint32_t j = 0; j < l; j++) {
+            uint64_t p = i * static_cast<uint64_t>(l) + static_cast<uint64_t>(j);  // global block index
+            uint64_t needed = static_cast<uint64_t>(k) * (p + 1);
+            if (wanted_iter < needed) {  // block p lies past the end; so does every larger j
+                break;
+            }
+            uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B);
+            if (do_stats) {
+                local_updates++;
+            }
+            nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L);  // bucket(j, b) *= checkpoint
+        }
+
+        if (do_stats) {
+            checkpoint_calls++;
+            bucket_updates += local_updates;
+            checkpoint_total_ns += static_cast<uint64_t>(
+                std::chrono::duration_cast<std::chrono::nanoseconds>(
+                    std::chrono::steady_clock::now() - started_at)
+                    .count());
+        }
+    }
+
+    bool init_ok() const { return getblock_ok; }  // false only if init_getblock_opt_state() failed
+
+    bool ok() const { return has_result; }  // true once iteration `wanted_iter` has been observed
+
+    const form& y() const { return result; }  // meaningful only when ok() is true
+
+    form finalize_proof() {  // folds all buckets into the proof form
+        auto started_at = std::chrono::steady_clock::time_point{};
+        if (stats_enabled) {
+            started_at = std::chrono::steady_clock::now();
+        }
+
+        PulmarkReducer reducer;
+        form id = form::identity(D);
+
+        uint64_t k1 = k / 2;    // bucket index b = b1 * 2^k0 + b0: b1 is the high k1 bits
+        uint64_t k0 = k - k1;   // ... and b0 is the low k0 bits
+        form x = id;
+
+        for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {  // rows from last to first
+            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
+
+            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {  // high-half pass: exponent b1 * 2^k0
+                form z = id;
+                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                }
+                z = FastPowFormNucomp(
+                    z,
+                    D,
+                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
+                    L,
+                    reducer);
+                nucomp_form(x, x, z, D, L);
+            }
+
+            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {  // low-half pass: exponent b0
+                form z = id;
+                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                }
+                z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
+                nucomp_form(x, x, z, D, L);
+            }
+        }
+
+        reducer.reduce(x);
+
+        if (stats_enabled) {
+            finalize_total_ns += static_cast<uint64_t>(
+                std::chrono::duration_cast<std::chrono::nanoseconds>(
+                    std::chrono::steady_clock::now() - started_at)
+                    .count());
+        }
+        return x;
+    }
+
+    bool stats_ok() const { return stats_enabled; }
+
+    LastStreamingStats stats() const {  // snapshot of this run's counters
+        LastStreamingStats out;
+        out.checkpoint_total_ns = checkpoint_total_ns;
+        out.checkpoint_event_total_ns = checkpoint_event_total_ns;
+        out.finalize_total_ns = finalize_total_ns;
+        out.checkpoint_calls = checkpoint_calls;
+        out.bucket_updates = bucket_updates;
+        out.set = stats_enabled;
+        return out;
+    }
+
+  private:
+    form& bucket(uint32_t j, uint64_t b) {  // row-major lookup: row j, column b; no bounds check
+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        return buckets[idx];
+    }
+
+    const form& bucket(uint32_t j, uint64_t b) const {  // const overload of the lookup above
+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        return buckets[idx];
+    }
+
+    uint64_t wanted_iter;  // total number of squarings requested
+    uint32_t k;
+    uint32_t l;
+    uint64_t kl;           // k * l: iterations between checkpoints
+    uint64_t limit;        // checkpoints with pos >= limit are ignored
+    integer B;
+    uint64_t progress_interval;
+    ChiavdfProgressCallback progress_cb;
+    void* progress_user_data;
+    uint64_t next_progress;  // next threshold at which `progress_cb` fires
+
+    std::vector<form> buckets;  // l * 2^k forms, indexed through bucket()
+    form result;
+    bool has_result = false;
+
+    bool use_getblock_opt;
+    bool getblock_ok = true;
+    uint64_t getblock_next_p = 0;  // block index that `getblock_r` currently corresponds to
+    integer getblock_inv_2k;       // inverse of 2^k modulo B
+    integer getblock_r;            // rolling value, started at FastPow(2, wanted_iter - k, B)
+    integer getblock_tmp;          // scratch for the quotient in get_block_opt()
+
+    bool stats_enabled;
+    uint64_t checkpoint_total_ns = 0;
+    uint64_t checkpoint_event_total_ns = 0;
+    uint64_t finalize_total_ns = 0;
+    uint64_t checkpoint_calls = 0;
+    uint64_t bucket_updates = 0;
+
+    bool init_getblock_opt_state() {  // prepares the rolling state; false means the fast path is unusable
+        if (k == 0) {
+            return false;
+        }
+        uint64_t k_u64 = static_cast<uint64_t>(k);
+        if (wanted_iter < k_u64) {  // no state set up; get_block_opt() defers to get_block() in this case
+            return true;
+        }
+
+        integer two_k_mod = FastPow(2, k_u64, B);
+        if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) {  // 0 means no inverse exists
+            return false;
+        }
+
+        getblock_r = FastPow(2, wanted_iter - k_u64, B);  // the value get_block() starts from for p = 0
+        getblock_next_p = 0;
+        return true;
+    }
+
+    uint64_t get_block_opt(uint64_t p) {  // incremental counterpart of get_block(p, k, wanted_iter, B)
+        if (!getblock_ok || wanted_iter < static_cast<uint64_t>(k)) {
+            return get_block(p, k, wanted_iter, B);
+        }
+
+        // Expected call pattern is sequential `p`. If we ever get out of sync,
+        // advance state forward or fall back to the slow mapping.
+        if (p < getblock_next_p) {
+            return get_block(p, k, wanted_iter, B);
+        }
+        while (getblock_next_p < p) {  // one step per skipped p: r = r * inv(2^k) mod B
+            mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
+            mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
+            getblock_next_p++;
+        }
+
+        mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k);       // tmp = r * 2^k
+        mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl);  // tmp = floor(tmp / B)
+        uint64_t b = mpz_get_ui(getblock_tmp.impl);
+
+        mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);  // pre-advance to p + 1
+        mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
+        getblock_next_p++;
+
+        return b;
+    }
+};
+
+// Shared body of the streaming provers: squares `x` `num_iterations` times while
+// folding checkpoints into proof buckets, then returns heap-allocated `y || proof`.
+// Returns an empty result on bad arguments or if the output differs from `y_ref`.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data,
+    bool use_getblock_opt) {
+    std::call_once(init_once, init_chiavdf_fast);
+
+    // Clear this thread's records so a failed call cannot expose stale values.
+    last_streaming_stats = LastStreamingStats{};
+    last_streaming_parameters = LastStreamingParameters{};
+
+    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
+        y_ref_s == nullptr || y_ref_s_size == 0) {
+        return empty_result();
+    }
+    // The size is narrowed to `int` below: reject 0 and values that do not fit.
+    if (num_iterations == 0 || discriminant_size_bits == 0 ||
+        discriminant_size_bits > static_cast<size_t>(std::numeric_limits<int>::max())) {
+        return empty_result();
+    }
+
+    std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
+    integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
+    integer L = root(-D, 4);
+
+    form x = DeserializeForm(D, x_s, x_s_size);
+    form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size);
+
+    uint32_t k = 0;
+    uint32_t l = 0;
+    bool tuned = false;
+    const uint64_t budget = bucket_memory_budget_bytes.load(std::memory_order_relaxed);
+    if (num_iterations >= (1 << 16)) {
+        tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k);
+    }
+    if (!tuned) {
+        if (num_iterations >= (1 << 16)) {
+            ApproximateParameters(num_iterations, l, k);
+        } else {
+            k = 10;
+            l = 1;
+        }
+    }
+    if (k == 0) {
+        k = 1;
+    }
+    if (l == 0) {
+        l = 1;
+    }
+
+    last_streaming_parameters.k = k;
+    last_streaming_parameters.l = l;
+    last_streaming_parameters.tuned = tuned;
+    last_streaming_parameters.set = true;
+
+    const uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
+    const uint64_t limit = num_iterations / kl + (num_iterations % kl != 0 ? 1 : 0);  // ceil
+
+    integer B = GetB(D, x, y_ref);
+
+    std::atomic<bool> stopped(false);
+    StreamingOneWesolowskiCallback weso(
+        D,
+        num_iterations,
+        k,
+        l,
+        limit,
+        B,
+        use_getblock_opt,
+        progress_interval,
+        progress_cb,
+        progress_user_data);
+
+    if (!weso.init_ok()) {
+        return empty_result();
+    }
+
+    // Checkpoint 0 is `x` itself; the squaring loop only reports later ones.
+    weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false);
+    FastStorage* fast_storage = nullptr;
+    repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
+
+    // Refuse to emit a proof unless the recomputed output matches `y_ref`.
+    if (!weso.ok() || !(weso.y() == y_ref)) {
+        return empty_result();
+    }
+
+    form proof_form = weso.finalize_proof();
+    if (weso.stats_ok()) {
+        last_streaming_stats = weso.stats();
+    }
+
+    int d_bits = D.num_bits();
+    std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
+    std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
+
+    if (y_serialized.empty() || proof_serialized.empty()) {
+        return empty_result();
+    }
+
+    const size_t total = y_serialized.size() + proof_serialized.size();
+    uint8_t* out = new uint8_t[total];
+    std::copy(y_serialized.begin(), y_serialized.end(), out);
+    std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
+    return ChiavdfByteArray{out, total};
+}
+} // namespace
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast(  // convenience form: no progress reporting
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations) {
+    return chiavdf_prove_one_weso_fast_with_progress(  // that function catches all exceptions
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        discriminant_size_bits,
+        num_iterations,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+// Non-streaming prover: stores the intermediates during squaring, then builds
+// the proof from them. Returns an empty result on bad arguments or any exception.
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    try {
+        std::call_once(init_once, init_chiavdf_fast);
+
+        // Also reject a discriminant size that is 0 or would not fit the `int` cast below.
+        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
+            num_iterations == 0 || discriminant_size_bits == 0 ||
+            discriminant_size_bits > static_cast<size_t>(std::numeric_limits<int>::max())) {
+            return empty_result();
+        }
+
+        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
+        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
+        integer L = root(-D, 4);
+        form x = DeserializeForm(D, x_s, x_s_size);
+
+        std::atomic<bool> stopped(false);
+        ProgressOneWesolowskiCallback weso(
+            D,
+            x,
+            num_iterations,
+            progress_interval,
+            progress_cb,
+            progress_user_data);
+
+        // Run the fast repeated-squaring engine; the callback stores the intermediates.
+        FastStorage* fast_storage = nullptr;
+        repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
+
+        // Now generate the compact proof from the stored intermediates.
+        Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped);
+        if (proof.y.empty() || proof.proof.empty()) {
+            return empty_result();
+        }
+
+        const size_t total = proof.y.size() + proof.proof.size();
+        uint8_t* out = new uint8_t[total];
+        std::copy(proof.y.begin(), proof.y.end(), out);
+        std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size());
+        return ChiavdfByteArray{out, total};
+    } catch (...) {
+        return empty_result();
+    }
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(  // streaming prover, no progress reporting
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations) {
+    return chiavdf_prove_one_weso_fast_streaming_with_progress(  // that function catches all exceptions
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        y_ref_s,
+        y_ref_s_size,
+        discriminant_size_bits,
+        num_iterations,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(  // streaming prover, plain block mapping
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    try {
+        return chiavdf_prove_one_weso_fast_streaming_impl(
+            challenge_hash,
+            challenge_size,
+            x_s,
+            x_s_size,
+            y_ref_s,
+            y_ref_s_size,
+            discriminant_size_bits,
+            num_iterations,
+            progress_interval,
+            progress_cb,
+            progress_user_data,
+            /*use_getblock_opt=*/false);  // bucket indices come from get_block() every time
+    } catch (...) {  // every exception becomes an empty result
+        return empty_result();
+    }
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(  // no progress reporting
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations) {
+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(  // catches all exceptions
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        y_ref_s,
+        y_ref_s_size,
+        discriminant_size_bits,
+        num_iterations,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(  // incremental block mapping
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    try {
+        return chiavdf_prove_one_weso_fast_streaming_impl(
+            challenge_hash,
+            challenge_size,
+            x_s,
+            x_s_size,
+            y_ref_s,
+            y_ref_s_size,
+            discriminant_size_bits,
+            num_iterations,
+            progress_interval,
+            progress_cb,
+            progress_user_data,
+            /*use_getblock_opt=*/true);  // bucket indices come from the rolling get_block_opt() state
+    } catch (...) {  // every exception becomes an empty result
+        return empty_result();
+    }
+}
+
+extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) {
+    bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed);  // read once by each streaming proof
+}
+
+extern "C" void chiavdf_set_enable_streaming_stats(bool enable) {
+    streaming_stats_enabled.store(enable, std::memory_order_relaxed);  // sampled when a streaming callback is built
+    last_streaming_stats = LastStreamingStats{};  // thread_local: clears the calling thread's record only
+}
+
+extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) {
+    if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) {  // all outputs are required
+        return false;
+    }
+    if (!last_streaming_parameters.set) {  // nothing recorded on this thread yet
+        return false;
+    }
+    *out_k = last_streaming_parameters.k;
+    *out_l = last_streaming_parameters.l;
+    *out_tuned = last_streaming_parameters.tuned;
+    return true;
+}
+
+extern "C" bool chiavdf_get_last_streaming_stats(
+    uint64_t* out_checkpoint_total_ns,
+    uint64_t* out_checkpoint_event_total_ns,
+    uint64_t* out_finalize_total_ns,
+    uint64_t* out_checkpoint_calls,
+    uint64_t* out_bucket_updates) {
+    if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr ||
+        out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr ||
+        out_bucket_updates == nullptr) {  // all outputs are required
+        return false;
+    }
+    if (!last_streaming_stats.set) {  // no stats recorded on this thread
+        return false;
+    }
+    *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns;
+    *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns;
+    *out_finalize_total_ns = last_streaming_stats.finalize_total_ns;
+    *out_checkpoint_calls = last_streaming_stats.checkpoint_calls;
+    *out_bucket_updates = last_streaming_stats.bucket_updates;
+    return true;
+}
+
+extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }  // pairs with the provers' new[]; null is fine
diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
new file mode 100644
index 00000000..bf33f320
--- /dev/null
+++ b/src/c_bindings/fast_wrapper.h
@@ -0,0 +1,145 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    uint8_t* data;
+    size_t length;
+} ChiavdfByteArray;
+
+typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
+
+// Configure the per-process memory budget used by the parameter tuner when
+// selecting `(k,l)` for streaming/bucket-based proving.
+//
+// The budget is per worker process (not global across multiple processes).
+//
+// If `bytes` is 0, the default chiavdf heuristic is used.
+void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes);
+
+// Debug helper: returns the `(k,l)` parameters selected for the most recent
+// streaming proof computed on the current thread.
+//
+// Returns true if parameters are available.
+bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned);
+
+// Enable lightweight timing counters for the streaming prover.
+//
+// The switch is process-wide. When enabled, the native library records basic
+// timing counters, per thread, for the most recent streaming proof computed on
+// that thread; calling this also clears the calling thread's stored counters.
+// For benchmarking and tuning only; keep disabled in production (overhead).
+void chiavdf_set_enable_streaming_stats(bool enable);
+
+// Debug helper: returns timing counters for the most recent streaming proof on
+// the current thread.
+//
+// Returns true if stats are available (i.e. stats enabled and a streaming proof
+// was computed successfully).
+bool chiavdf_get_last_streaming_stats(
+    uint64_t* out_checkpoint_total_ns,
+    uint64_t* out_checkpoint_event_total_ns,
+    uint64_t* out_finalize_total_ns,
+    uint64_t* out_checkpoint_calls,
+    uint64_t* out_bucket_updates);
+
+// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
+//
+// On success, returns `y || proof` where:
+// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
+// - `proof` is the serialized witness form (same size as `y`)
+//
+// On failure, returns `{NULL, 0}`.
+ChiavdfByteArray chiavdf_prove_one_weso_fast(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
+// the proving thread every `progress_interval` iterations completed.
+//
+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
+// bucket-accumulation algorithm (Trick 1), which requires the expected output
+// `y_ref` up front (as used by bluebox compaction jobs).
+//
+// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`).
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
+// `progress_cb` from the proving thread every `progress_interval` iterations.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
+// implementation of the `GetBlock()` mapping (avoids per-block modular
+// exponentiation without allocating a full `GetBlock` table).
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations);
+
+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
+// invokes `progress_cb` from the proving thread every `progress_interval`
+// iterations.
+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    const uint8_t* y_ref_s,
+    size_t y_ref_s_size,
+    size_t discriminant_size_bits,
+    uint64_t num_iterations,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+void chiavdf_free_byte_array(ChiavdfByteArray array);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/threading.h b/src/threading.h
index 3244b3c3..8354d824 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -566,8 +566,8 @@ struct alignas(64) thread_counter {
     }
 };
 
-thread_counter master_counter[100];
-thread_counter slave_counter[100];
+thread_counter master_counter[512];
+thread_counter slave_counter[512];
 
 struct thread_state {
     int pairindex;
diff --git a/src/vdf.h b/src/vdf.h
index 7bb911f9..f24c09c6 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -87,6 +87,18 @@ std::mutex new_event_mutex, cout_lock;
 bool debug_mode = false;
 bool fast_algorithm = false;
 bool two_weso = false;
+bool quiet_mode = false;
+
+// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
+// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
+// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
+// run concurrently in the same process; they must not share a pairindex.
+inline int vdf_fast_pairindex() {
+    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
+    static std::atomic<int> next_slot{0};
+    thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
+    return slot;
+}
 
 //always works
 void repeated_square_original(vdf_original &vdfo, form& f, const integer&, const integer&, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
@@ -195,7 +207,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
         // x86/x64: use the phased pipeline.
         square_state_type square_state;
-        square_state.pairindex = 0;
+        square_state.pairindex = vdf_fast_pairindex();
         actual_iterations = repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
 #else
         // Non-x86: use the C++ NUDUPL path (faster and lower maintenance than the phased pipeline).
@@ -298,10 +310,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
             }
         #endif
     }
-    {
-        // this shouldn't be needed but avoids some false positive in TSAN
-        std::lock_guard<std::mutex> lk(cout_lock);
-        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
+    if (!quiet_mode) {
+        {
+            // this shouldn't be needed but avoids some false positive in TSAN
+            std::lock_guard<std::mutex> lk(cout_lock);
+            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
+        }
     }
 
     #ifdef VDF_TEST
@@ -337,11 +351,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
     proof_serialized = SerializeForm(proof_form, d_bits);
     Proof proof(y_serialized, proof_serialized);
     proof.witness_type = 0;
-    {
-        // this shouldn't be needed but avoids some false positive in TSAN
-        std::lock_guard<std::mutex> lk(cout_lock);
-        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
-    }
     return proof;
 }
 

From 7be07522d02eca9fc65ca9b34b96eb2057659e76 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Mon, 23 Feb 2026 23:41:52 -0800
Subject: [PATCH 02/13] Fix non-x86 build break in vdf_fast_pairindex.

Guard the fast pairindex slot selection behind the existing x86/asm feature checks and return slot 0 on non-x86 targets, where threading counters are not compiled.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/vdf.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/vdf.h b/src/vdf.h
index f24c09c6..c2f8834f 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -94,10 +94,14 @@ bool quiet_mode = false;
 // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
 // run concurrently in the same process; they must not share a pairindex.
 inline int vdf_fast_pairindex() {
+#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
     static std::atomic<int> next_slot{0};
     thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
     return slot;
+#else
+    return 0;
+#endif
 }
 
 //always works

From 3755be28167c172b1bb6115b081d1d23903a6d98 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Mon, 23 Feb 2026 23:57:04 -0800
Subject: [PATCH 03/13] Ensure cmake is present on macOS CI runners.

Install cmake via Homebrew and export its bin path in the C libraries and wheel workflows so self-hosted macOS jobs don't fail when cmake is missing from PATH.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-c-libraries.yml | 11 +++++++++++
 .github/workflows/build.yml             | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml
index 00ca38c9..db833104 100644
--- a/.github/workflows/build-c-libraries.yml
+++ b/.github/workflows/build-c-libraries.yml
@@ -82,6 +82,17 @@ jobs:
         fetch-depth: 1
         path: mpir_gc_x64
 
+    - name: Ensure cmake available (macOS)
+      if: matrix.os.matrix == 'macos'
+      shell: bash
+      run: |
+        brew ls --versions cmake >/dev/null 2>&1 || brew install cmake
+        CMAKE_BIN="$(brew --prefix cmake)/bin"
+        if [ -d "$CMAKE_BIN" ]; then
+          echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+        fi
+        cmake --version
+
     - name: Build
       working-directory: src
       env:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index cd6bec02..4ad967ec 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -102,6 +102,17 @@ jobs:
       with:
         python-version: ${{ matrix.python.major-dot-minor }}
 
+    - name: Ensure cmake available (macOS)
+      if: matrix.os.matrix == 'macos'
+      shell: bash
+      run: |
+        brew ls --versions cmake >/dev/null 2>&1 || brew install cmake
+        CMAKE_BIN="$(brew --prefix cmake)/bin"
+        if [ -d "$CMAKE_BIN" ]; then
+          echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+        fi
+        cmake --version
+
     - name: Install pipx
       run: |
         pip install pipx

From 073427ac5cc60adea976104e1aff47bd9fb2ccc4 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:00:51 -0800
Subject: [PATCH 04/13] Improve fast-path batch replay handling and harden
 pairindex slot allocation.

Track and roll back per-batch checkpoints when replaying a failed fast batch, and switch pairindex slot allocation to unsigned atomics to avoid negative modulo indexing after counter wraparound.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 44 ++++++++++++++++++++++++++++++++-
 src/callback.h                  |  8 ++++++
 src/vdf.h                       | 12 ++++++---
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 198d0a87..5f01e905 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -243,6 +243,9 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
                 }
                 SetForm(type, data, &checkpoint);
                 process_checkpoint(pos, checkpoint, /*record_stats=*/true);
+                if (iteration >= batch_start_iteration && iteration <= batch_end_iteration) {
+                    current_batch_checkpoints.push_back(BatchCheckpoint{pos, checkpoint});
+                }
                 if (stats_enabled) {
                     checkpoint_event_total_ns += static_cast<uint64_t>(
                         std::chrono::duration_cast<std::chrono::nanoseconds>(
@@ -258,7 +261,44 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         }
     }
 
+    void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) override {
+        current_batch_checkpoints.clear();
+        if (batch_size == 0) {
+            batch_start_iteration = 1;
+            batch_end_iteration = 0;
+            return;
+        }
+        batch_start_iteration = base_iteration + 1;
+        if (std::numeric_limits<uint64_t>::max() - base_iteration < batch_size) {
+            batch_end_iteration = std::numeric_limits<uint64_t>::max();
+        } else {
+            batch_end_iteration = base_iteration + batch_size;
+        }
+    }
+
+    void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {
+        for (const BatchCheckpoint& entry : current_batch_checkpoints) {
+            rollback_checkpoint(entry.index, entry.checkpoint);
+        }
+        OnBatchStart(base_iteration, batch_size);
+    }
+
     void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
+        apply_checkpoint(i, checkpoint, record_stats);
+    }
+
+  private:
+    struct BatchCheckpoint {
+        uint64_t index;
+        form checkpoint;
+    };
+
+    void rollback_checkpoint(uint64_t i, const form& checkpoint) {
+        form inverse_checkpoint = checkpoint.inverse();
+        apply_checkpoint(i, inverse_checkpoint, /*record_stats=*/false);
+    }
+
+    void apply_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
         const bool do_stats = stats_enabled && record_stats;
         auto started_at = std::chrono::steady_clock::time_point{};
         if (do_stats) {
@@ -359,7 +399,6 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         return out;
     }
 
-  private:
     form& bucket(uint32_t j, uint64_t b) {
         size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
         return buckets[idx];
@@ -391,6 +430,9 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     integer getblock_inv_2k;
     integer getblock_r;
     integer getblock_tmp;
+    uint64_t batch_start_iteration = 1;
+    uint64_t batch_end_iteration = 0;
+    std::vector<BatchCheckpoint> current_batch_checkpoints;
 
     bool stats_enabled;
     uint64_t checkpoint_total_ns = 0;
diff --git a/src/callback.h b/src/callback.h
index f4764bbf..9ebf3543 100644
--- a/src/callback.h
+++ b/src/callback.h
@@ -73,6 +73,14 @@ class WesolowskiCallback :public INUDUPLListener {
     }
 
     virtual void OnIteration(int type, void *data, uint64_t iteration) = 0;
+    virtual void OnBatchStart(uint64_t base_iteration, uint64_t batch_size) {
+        (void)base_iteration;
+        (void)batch_size;
+    }
+    virtual void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) {
+        (void)base_iteration;
+        (void)batch_size;
+    }
 
     std::unique_ptr<form[]> forms;
     size_t forms_capacity = 0;
diff --git a/src/vdf.h b/src/vdf.h
index c2f8834f..8ca75d8c 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -95,9 +95,9 @@ bool quiet_mode = false;
 // run concurrently in the same process; they must not share a pairindex.
 inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
-    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
-    static std::atomic<int> next_slot{0};
-    thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
+    constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
+    static std::atomic<unsigned int> next_slot{0};
+    thread_local int slot = int(next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
     return slot;
 #else
     return 0;
@@ -201,6 +201,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
         #endif
 
         uint64 batch_size=c_checkpoint_interval;
+        if (weso != NULL) {
+            weso->OnBatchStart(num_iterations, batch_size);
+        }
 
         #ifdef ENABLE_TRACK_CYCLES
             print( "track cycles enabled; results will be wrong" );
@@ -231,6 +234,9 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
 
         if (actual_iterations==~uint64(0)) {
             //corruption; f is unchanged. do the entire batch with the slow algorithm
+            if (weso != NULL) {
+                weso->OnBatchReplay(num_iterations, batch_size);
+            }
             repeated_square_original(*weso->vdfo, f, D, L, num_iterations, batch_size, weso);
             actual_iterations=batch_size;
 

From fd000ab88ded1cc3386acd892401cb626ae3cc14 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:33:22 -0800
Subject: [PATCH 05/13] Clarify batch iteration indexing in streaming callback.

Document that batch bounds use completed-iteration base values while OnIteration is normalized to 1-based indices to avoid ambiguity in replay tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 5f01e905..61d24599 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -268,6 +268,8 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
             batch_end_iteration = 0;
             return;
         }
+        // `base_iteration` is the number of completed iterations before this batch.
+        // `OnIteration` normalizes to 1-based (`iteration++`), so this batch is [base+1, base+size].
         batch_start_iteration = base_iteration + 1;
         if (std::numeric_limits<uint64_t>::max() - base_iteration < batch_size) {
             batch_end_iteration = std::numeric_limits<uint64_t>::max();

From 3f82dc2786903e0029d92fc36d5f95f6a04d94bc Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:48:29 -0800
Subject: [PATCH 06/13] Add streaming tuner diagnostics and batch fast-wrapper
 APIs.

Expose missing batch C bindings and debug visibility so downstream Rust tests can validate tuner behavior end-to-end.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 pr1_upstream_ready.patch        | 1158 +++++++++++++++++++++++++++++++
 src/c_bindings/fast_wrapper.cpp |  168 +++++
 src/c_bindings/fast_wrapper.h   |   32 +
 3 files changed, 1358 insertions(+)
 create mode 100644 pr1_upstream_ready.patch

diff --git a/pr1_upstream_ready.patch b/pr1_upstream_ready.patch
new file mode 100644
index 00000000..b14a93bb
--- /dev/null
+++ b/pr1_upstream_ready.patch
@@ -0,0 +1,1158 @@
+diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
+index ed41963..ca55a95 100644
+--- a/src/Makefile.vdf-client
++++ b/src/Makefile.vdf-client
+@@ -6,9 +6,24 @@ else
+ NOPIE = -no-pie
+ endif
+ 
+-LDFLAGS += -flto $(NOPIE) -g
++# Optional: override `LTO=` to disable link-time optimization.
++LTO ?= -flto
++
++# Optional: set `PIC=1` to build position-independent objects (recommended when
++# linking chiavdf code into other PIE/shared-library binaries).
++PIC ?= 0
++ifeq ($(PIC),1)
++PICFLAGS = -fPIC
++PIEFLAGS =
++else
++PICFLAGS =
++PIEFLAGS = $(NOPIE)
++endif
++
++LDFLAGS += $(LTO) $(PIEFLAGS) -g
+ LDLIBS += -lgmpxx -lgmp -pthread
+-CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
++CXXFLAGS += $(LTO) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
++ASFLAGS += $(PICFLAGS)
+ ifeq ($(UNAME),Darwin)
+ CXXFLAGS += -D CHIAOSX=1
+ endif
+@@ -31,7 +46,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
+ all: $(BINS)
+ 
+ clean:
+-	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
++	rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a
+ 
+ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
+ 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+@@ -39,7 +54,10 @@ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_as
+ $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
+ 
+ lzcnt.o: refcode/lzcnt.c
+-	$(CC) -c refcode/lzcnt.c
++	$(CC) -c refcode/lzcnt.c $(OPT_CFLAGS) $(PICFLAGS)
++
++%.o: %.s
++	$(CC) -c $< -o $@ $(ASFLAGS)
+ 
+ asm_compiled.s: compile_asm
+ 	./compile_asm
+@@ -53,6 +71,22 @@ avx512_asm_compiled.s: compile_asm
+ compile_asm: compile_asm.o
+ 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+ 
++# ---------------------------------------------------------------------------
++# Static library: fast one-wesolowski proof (BBR integration)
++# ---------------------------------------------------------------------------
++
++FASTLIB = libchiavdf_fastc.a
++FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
++
++.PHONY: fastlib
++
++fastlib: $(FASTLIB)
++
++$(FASTLIB): $(FASTLIB_OBJS)
++	$(AR) rcs $@ $^
++
++c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS)
++
+ HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base.o lzcnt.o
+ EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o
+ HW_LIB = hw/libft4222/build-x86_64/libft4222.so
+diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
+new file mode 100644
+index 0000000..198d0a8
+--- /dev/null
++++ b/src/c_bindings/fast_wrapper.cpp
+@@ -0,0 +1,795 @@
++#include "fast_wrapper.h"
++
++#include <atomic>
++#include <chrono>
++#include <limits>
++#include <mutex>
++#include <vector>
++
++#include "../vdf.h"
++#include "../create_discriminant.h"
++
++// Runtime configuration knobs required by `parameters.h`.
++// These are `extern` variables there, but each binary defines them explicitly.
++bool use_divide_table = false;
++int gcd_base_bits = 50;
++int gcd_128_max_iter = 3;
++std::string asmprefix = "cel_";
++bool enable_all_instructions = false;
++
++namespace {
++std::once_flag init_once;
++std::atomic<uint64_t> bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL);
++std::atomic<bool> streaming_stats_enabled(false);
++
++struct LastStreamingParameters {
++    uint32_t k = 0;
++    uint32_t l = 0;
++    bool tuned = false;
++    bool set = false;
++};
++
++thread_local LastStreamingParameters last_streaming_parameters;
++
++struct LastStreamingStats {
++    uint64_t checkpoint_total_ns = 0;
++    uint64_t checkpoint_event_total_ns = 0;
++    uint64_t finalize_total_ns = 0;
++    uint64_t checkpoint_calls = 0;
++    uint64_t bucket_updates = 0;
++    bool set = false;
++};
++
++thread_local LastStreamingStats last_streaming_stats;
++
++void init_chiavdf_fast() {
++    init_gmp();
++    set_rounding_mode();
++
++    // Match the vdf_client runtime selection for AVX2.
++    if (hasAVX2()) {
++        gcd_base_bits = 63;
++        gcd_128_max_iter = 2;
++    } else {
++        gcd_base_bits = 50;
++        gcd_128_max_iter = 3;
++    }
++
++    // Ensure we run the one-wesolowski path by default.
++    fast_algorithm = false;
++    two_weso = false;
++    quiet_mode = true;
++}
++
++ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
++
++uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
++    // Be conservative: class group forms contain 3 GMP-backed integers that
++    // quickly grow to the discriminant size (or beyond) during NUCOMP.
++    //
++    // This estimate is intentionally larger than the raw serialized size to
++    // avoid picking parameters that risk paging/OOM.
++    uint64_t discr_bytes = (static_cast<uint64_t>(discriminant_size_bits) + 7) / 8;
++    uint64_t estimate = discr_bytes * 16;
++    if (estimate < 2048) {
++        estimate = 2048;
++    }
++    return estimate;
++}
++
++bool tune_streaming_parameters(
++    uint64_t num_iterations,
++    size_t discriminant_size_bits,
++    uint64_t memory_budget_bytes,
++    uint32_t& out_l,
++    uint32_t& out_k) {
++    if (memory_budget_bytes == 0) {
++        return false;
++    }
++
++    // Keep headroom for GMP scratch allocations and general process overhead.
++    uint64_t budget = (memory_budget_bytes * 80) / 100;
++    uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits);
++    if (budget < bytes_per_form) {
++        return false;
++    }
++
++    unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
++    bool found = false;
++
++    // Empirical tuning notes (1024-bit discriminants, AVX2 build):
++    // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
++    // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs.
++    //
++    // So checkpoint counts should be weighted much lower than updates/fold.
++    constexpr unsigned __int128 update_weight = 16;
++    constexpr unsigned __int128 fold_weight = 16;
++    constexpr unsigned __int128 checkpoint_weight = 1;
++
++    // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work
++    // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k).
++    for (uint32_t k = 4; k <= 20; k++) {
++        unsigned __int128 buckets_per_row = static_cast<unsigned __int128>(1) << k;
++
++        for (uint32_t l = 1; l <= 64; l++) {
++            unsigned __int128 form_count = buckets_per_row * static_cast<unsigned __int128>(l);
++            unsigned __int128 mem_required =
++                form_count * static_cast<unsigned __int128>(bytes_per_form);
++            if (mem_required > static_cast<unsigned __int128>(budget)) {
++                continue;
++            }
++
++            unsigned __int128 updates = static_cast<unsigned __int128>(
++                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));
++            uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
++            unsigned __int128 checkpoints = static_cast<unsigned __int128>(
++                (num_iterations + kl - 1) / kl);
++            unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
++            unsigned __int128 cost =
++                updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;
++
++            if (!found || cost < best_cost) {
++                found = true;
++                best_cost = cost;
++                out_k = k;
++                out_l = l;
++            }
++        }
++    }
++
++    return found;
++}
++
++uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) {
++    integer res = FastPow(2, T - k * (i + 1), B);
++    mpz_mul_2exp(res.impl, res.impl, k);
++    res = res / B;
++    auto res_vector = res.to_vector();
++    return res_vector.empty() ? 0 : res_vector[0];
++}
++
++class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {
++  public:
++    ProgressOneWesolowskiCallback(
++        integer& D,
++        form& f,
++        uint64_t wanted_iter,
++        uint64_t progress_interval,
++        ChiavdfProgressCallback progress_cb,
++        void* progress_user_data)
++        : OneWesolowskiCallback(D, f, wanted_iter),
++          progress_interval(progress_interval),
++          progress_cb(progress_cb),
++          progress_user_data(progress_user_data),
++          next_progress(progress_interval) {}
++
++    void OnIteration(int type, void* data, uint64_t iteration) override {
++        OneWesolowskiCallback::OnIteration(type, data, iteration);
++
++        if (progress_cb == nullptr || progress_interval == 0) {
++            return;
++        }
++
++        uint64_t done = iteration + 1;
++        if (done > wanted_iter) {
++            return;
++        }
++
++        if (done >= next_progress) {
++            progress_cb(next_progress, progress_user_data);
++            next_progress += progress_interval;
++        }
++    }
++
++  private:
++    uint64_t progress_interval;
++    ChiavdfProgressCallback progress_cb;
++    void* progress_user_data;
++    uint64_t next_progress;
++};
++
++class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
++  public:
++    StreamingOneWesolowskiCallback(
++        integer& D,
++        uint64_t wanted_iter,
++        uint32_t k,
++        uint32_t l,
++        uint64_t limit,
++        integer& B,
++        bool use_getblock_opt,
++        uint64_t progress_interval,
++        ChiavdfProgressCallback progress_cb,
++        void* progress_user_data)
++        : WesolowskiCallback(D),
++          wanted_iter(wanted_iter),
++          k(k),
++          l(l),
++          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
++          limit(limit),
++          B(B),
++          progress_interval(progress_interval),
++          progress_cb(progress_cb),
++          progress_user_data(progress_user_data),
++          next_progress(progress_interval),
++          use_getblock_opt(use_getblock_opt),
++          stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
++        form id = form::identity(D);
++        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);
++
++        if (use_getblock_opt) {
++            getblock_ok = init_getblock_opt_state();
++        }
++    }
++
++    void OnIteration(int type, void* data, uint64_t iteration) override {
++        iteration++;
++        if (iteration > wanted_iter) {
++            return;
++        }
++
++        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
++            progress_cb(next_progress, progress_user_data);
++            next_progress += progress_interval;
++        }
++
++        if (iteration % kl == 0) {
++            uint64_t pos = iteration / kl;
++            if (pos < limit) {
++                form checkpoint;
++                auto started_at = std::chrono::steady_clock::time_point{};
++                if (stats_enabled) {
++                    started_at = std::chrono::steady_clock::now();
++                }
++                SetForm(type, data, &checkpoint);
++                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
++                if (stats_enabled) {
++                    checkpoint_event_total_ns += static_cast<uint64_t>(
++                        std::chrono::duration_cast<std::chrono::nanoseconds>(
++                            std::chrono::steady_clock::now() - started_at)
++                            .count());
++                }
++            }
++        }
++
++        if (iteration == wanted_iter) {
++            SetForm(type, data, &result);
++            has_result = true;
++        }
++    }
++
++    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) { // folds checkpoint number i into one bucket per row j in [0, l)
++        const bool do_stats = stats_enabled && record_stats; // needs both the global toggle and the caller's opt-in
++        auto started_at = std::chrono::steady_clock::time_point{};
++        if (do_stats) {
++            started_at = std::chrono::steady_clock::now();
++        }
++
++        uint64_t local_updates = 0;
++        for (uint32_t j = 0; j < l; j++) {
++            uint64_t p = i * static_cast<uint64_t>(l) + static_cast<uint64_t>(j); // block index for row j of checkpoint i
++            uint64_t needed = static_cast<uint64_t>(k) * (p + 1); // block p is only used when wanted_iter >= k * (p + 1)
++            if (wanted_iter < needed) {
++                break; // `needed` grows with j, so the remaining rows are skipped as well
++            }
++            uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B); // bucket index from the incremental or the direct mapping
++            if (do_stats) {
++                local_updates++;
++            }
++            nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L); // same bucket passed as first and second argument (presumably output and left operand - confirm against nucomp_form)
++        }
++
++        if (do_stats) {
++            checkpoint_calls++;
++            bucket_updates += local_updates;
++            checkpoint_total_ns += static_cast<uint64_t>( // time spent inside this function only
++                std::chrono::duration_cast<std::chrono::nanoseconds>(
++                    std::chrono::steady_clock::now() - started_at)
++                    .count());
++        }
++    }
++
++    bool init_ok() const { return getblock_ok; } // false only if the GetBlock fast path was requested and init_getblock_opt_state() failed
++
++    bool ok() const { return has_result; } // true once OnIteration has stored the form for iteration == wanted_iter
++
++    const form& y() const { return result; } // output form captured by OnIteration; only meaningful when ok() is true
++
++    form finalize_proof() { // combines all accumulated buckets into a single form and returns it
++        auto started_at = std::chrono::steady_clock::time_point{};
++        if (stats_enabled) {
++            started_at = std::chrono::steady_clock::now();
++        }
++
++        PulmarkReducer reducer;
++        form id = form::identity(D);
++
++        uint64_t k1 = k / 2; // high part of the k-bit bucket index
++        uint64_t k0 = k - k1; // low part, so that a bucket index is b1 * 2^k0 + b0
++        form x = id;
++
++        for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) { // rows are folded from the last one down to row 0
++            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer); // raise the running result to 2^k before adding row j
++
++            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) { // high-part pass: one partial product per b1
++                form z = id;
++                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
++                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
++                }
++                z = FastPowFormNucomp( // exponent b1 * 2^k0 is the high-part contribution shared by these buckets
++                    z,
++                    D,
++                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
++                    L,
++                    reducer);
++                nucomp_form(x, x, z, D, L);
++            }
++
++            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) { // low-part pass: one partial product per b0
++                form z = id;
++                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
++                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
++                }
++                z = FastPowFormNucomp(z, D, integer(b0), L, reducer); // exponent b0 is the low-part contribution shared by these buckets
++                nucomp_form(x, x, z, D, L);
++            }
++        }
++
++        reducer.reduce(x); // reduce once more before handing the form back
++
++        if (stats_enabled) {
++            finalize_total_ns += static_cast<uint64_t>( // wall time of this whole call
++                std::chrono::duration_cast<std::chrono::nanoseconds>(
++                    std::chrono::steady_clock::now() - started_at)
++                    .count());
++        }
++        return x;
++    }
++
++    bool stats_ok() const { return stats_enabled; } // reflects the stats toggle sampled when this object was constructed
++
++    LastStreamingStats stats() const { // snapshot of the counters gathered by this callback
++        LastStreamingStats out;
++        out.checkpoint_total_ns = checkpoint_total_ns; // time inside process_checkpoint (calls made with record_stats=true)
++        out.checkpoint_event_total_ns = checkpoint_event_total_ns; // SetForm + process_checkpoint time measured in OnIteration
++        out.finalize_total_ns = finalize_total_ns; // time inside finalize_proof
++        out.checkpoint_calls = checkpoint_calls;
++        out.bucket_updates = bucket_updates;
++        out.set = stats_enabled; // marks the snapshot as valid only when stats were enabled
++        return out;
++    }
++
++  private:
++    form& bucket(uint32_t j, uint64_t b) { // mutable access to bucket b of row j
++        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b); // row-major: each row holds 2^k consecutive buckets
++        return buckets[idx]; // unchecked indexing: callers must keep j < l and b < 2^k
++    }
++
++    const form& bucket(uint32_t j, uint64_t b) const { // read-only overload with the same indexing
++        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
++        return buckets[idx];
++    }
++
++    uint64_t wanted_iter; // iteration count to reach; OnIteration ignores anything beyond it
++    uint32_t k; // bucket index width: each row has 2^k buckets
++    uint32_t l; // number of bucket rows
++    uint64_t kl; // checkpoint spacing used by OnIteration (iteration % kl)
++    uint64_t limit; // checkpoints with pos >= limit are not processed
++    integer B; // modulus used by both GetBlock mappings
++    uint64_t progress_interval; // 0 disables progress reporting
++    ChiavdfProgressCallback progress_cb; // may be nullptr (no reporting)
++    void* progress_user_data; // passed back to progress_cb unchanged
++    uint64_t next_progress; // next threshold at which progress_cb fires
++
++    std::vector<form> buckets; // l * 2^k forms, all initialised to the identity in the constructor
++    form result; // form captured at iteration == wanted_iter
++    bool has_result = false;
++
++    bool use_getblock_opt; // selects get_block_opt() over get_block() in process_checkpoint
++    bool getblock_ok = true; // cleared when init_getblock_opt_state() fails
++    uint64_t getblock_next_p = 0; // block index the rolling state currently corresponds to
++    integer getblock_inv_2k; // inverse of 2^k modulo B
++    integer getblock_r; // rolling value, seeded with 2^(wanted_iter - k) mod B
++    integer getblock_tmp; // scratch value for get_block_opt()
++
++    bool stats_enabled; // sampled once from streaming_stats_enabled at construction
++    uint64_t checkpoint_total_ns = 0;
++    uint64_t checkpoint_event_total_ns = 0;
++    uint64_t finalize_total_ns = 0;
++    uint64_t checkpoint_calls = 0;
++    uint64_t bucket_updates = 0;
++
++    bool init_getblock_opt_state() { // prepares the rolling state for get_block_opt(); false means the fast path cannot be set up
++        if (k == 0) {
++            return false; // k == 0 is rejected outright
++        }
++        uint64_t k_u64 = static_cast<uint64_t>(k);
++        if (wanted_iter < k_u64) {
++            return true; // nothing to precompute: get_block_opt() falls back to get_block() in this case
++        }
++
++        integer two_k_mod = FastPow(2, k_u64, B); // presumably 2^k mod B
++        if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) { // mpz_invert returns 0 when no inverse exists
++            return false;
++        }
++
++        getblock_r = FastPow(2, wanted_iter - k_u64, B); // rolling value for block p = 0
++        getblock_next_p = 0;
++        return true;
++    }
++
++    uint64_t get_block_opt(uint64_t p) { // incremental bucket-index mapping; used in place of get_block(p, k, wanted_iter, B)
++        if (!getblock_ok || wanted_iter < static_cast<uint64_t>(k)) {
++            return get_block(p, k, wanted_iter, B); // rolling state was never initialised: use the direct mapping
++        }
++
++        // Expected call pattern is sequential `p`. If we ever get out of sync,
++        // advance state forward or fall back to the slow mapping.
++        if (p < getblock_next_p) {
++            return get_block(p, k, wanted_iter, B); // the rolling state cannot rewind
++        }
++        while (getblock_next_p < p) { // skip ahead: one multiply by the inverse of 2^k per skipped block
++            mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
++            mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
++            getblock_next_p++;
++        }
++
++        mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k); // tmp = r * 2^k
++        mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl); // tmp = floor(r * 2^k / B)
++        uint64_t b = mpz_get_ui(getblock_tmp.impl); // bucket index for block p
++
++        mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl); // advance the state so it is ready for block p + 1
++        mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
++        getblock_next_p++;
++
++        return b;
++    }
++};
++
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl( // shared body of the streaming entry points; returns y || proof, or an empty result on failure
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data,
++    bool use_getblock_opt) {
++    std::call_once(init_once, init_chiavdf_fast); // one-time library initialisation
++
++    last_streaming_stats = LastStreamingStats{}; // drop stats left over from an earlier call
++
++    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
++        y_ref_s == nullptr || y_ref_s_size == 0) {
++        return empty_result(); // every input buffer must be present and non-empty
++    }
++    if (num_iterations == 0) {
++        return empty_result();
++    }
++
++    std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
++    integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
++    integer L = root(-D, 4);
++
++    form x = DeserializeForm(D, x_s, x_s_size);
++    form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size);
++
++    uint32_t k;
++    uint32_t l;
++    bool tuned = false;
++    const uint64_t budget =
++        bucket_memory_budget_bytes.load(std::memory_order_relaxed);
++    if (num_iterations >= (1 << 16)) { // memory-budgeted tuning is only attempted from 2^16 iterations upward
++        tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k);
++    }
++    if (!tuned) {
++        if (num_iterations >= (1 << 16)) {
++            ApproximateParameters(num_iterations, l, k); // fallback heuristic when the tuner found nothing
++        } else {
++            k = 10; // short runs use fixed parameters
++            l = 1;
++        }
++    }
++    if (k == 0) {
++        k = 1; // zero would make kl == 0 below (division by zero)
++    }
++    if (l == 0) {
++        l = 1;
++    }
++
++    last_streaming_parameters.k = k; // exposed via chiavdf_get_last_streaming_parameters
++    last_streaming_parameters.l = l;
++    last_streaming_parameters.tuned = tuned;
++    last_streaming_parameters.set = true;
++
++    uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
++    uint64_t limit = num_iterations / kl; // limit = ceil(num_iterations / kl)
++    if (num_iterations % kl) {
++        limit++;
++    }
++
++    integer B = GetB(D, x, y_ref); // derived from the expected output before any squaring happens
++
++    std::atomic<bool> stopped(false); // never set here; only passed through to repeated_square
++    StreamingOneWesolowskiCallback weso(
++        D,
++        num_iterations,
++        k,
++        l,
++        limit,
++        B,
++        use_getblock_opt,
++        progress_interval,
++        progress_cb,
++        progress_user_data);
++
++    if (!weso.init_ok()) {
++        return empty_result(); // GetBlock fast path was requested but could not be initialised
++    }
++
++    weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false); // checkpoint 0 is the input form itself and is excluded from stats
++
++    FastStorage* fast_storage = nullptr; // no FastStorage is supplied
++    repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
++
++    if (!weso.ok()) {
++        return empty_result(); // the callback never saw the final iteration
++    }
++    if (!(weso.y() == y_ref)) {
++        return empty_result(); // computed output differs from the caller's y_ref
++    }
++
++    form proof_form = weso.finalize_proof();
++
++    if (weso.stats_ok()) {
++        last_streaming_stats = weso.stats(); // published before serialisation, so also kept if serialisation fails below
++    }
++
++    int d_bits = D.num_bits();
++    std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits); // y_ref equals the computed output (checked above)
++    std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
++
++    if (y_serialized.empty() || proof_serialized.empty()) {
++        return empty_result();
++    }
++
++    const size_t total = y_serialized.size() + proof_serialized.size();
++    uint8_t* out = new uint8_t[total]; // ownership passes to the caller; chiavdf_free_byte_array releases it with delete[]
++    std::copy(y_serialized.begin(), y_serialized.end(), out);
++    std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
++    return ChiavdfByteArray{out, total};
++}
++} // namespace
++
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast( // convenience wrapper: same proof, progress reporting disabled
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations) {
++    return chiavdf_prove_one_weso_fast_with_progress( // all validation and error handling happen in the callee
++        challenge_hash,
++        challenge_size,
++        x_s,
++        x_s_size,
++        discriminant_size_bits,
++        num_iterations,
++        /*progress_interval=*/0,
++        /*progress_cb=*/nullptr,
++        /*progress_user_data=*/nullptr);
++}
++
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress( // non-streaming prover: returns y || proof, or an empty result on failure
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data) {
++    try { // any exception is turned into an empty result by the catch below
++        std::call_once(init_once, init_chiavdf_fast);
++
++        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
++            return empty_result(); // both input buffers must be present and non-empty
++        }
++        if (num_iterations == 0) {
++            return empty_result();
++        }
++
++        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
++        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
++        integer L = root(-D, 4);
++
++        form x = DeserializeForm(D, x_s, x_s_size);
++
++        std::atomic<bool> stopped(false); // never set here; only passed through to the engine and the prover
++        ProgressOneWesolowskiCallback weso(
++            D,
++            x,
++            num_iterations,
++            progress_interval,
++            progress_cb,
++            progress_user_data);
++
++        // Run the fast repeated-squaring engine to `num_iterations`.
++        // The callback stores all intermediates needed for the proof.
++        FastStorage* fast_storage = nullptr;
++        repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
++
++        // Now generate the compact proof from the stored intermediates.
++        Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped);
++        if (proof.y.empty() || proof.proof.empty()) {
++            return empty_result();
++        }
++
++        const size_t total = proof.y.size() + proof.proof.size();
++        uint8_t* out = new uint8_t[total]; // ownership passes to the caller; chiavdf_free_byte_array releases it with delete[]
++        std::copy(proof.y.begin(), proof.y.end(), out);
++        std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size());
++        return ChiavdfByteArray{out, total};
++    } catch (...) {
++        return empty_result(); // failure is reported as {nullptr, 0}; the cause is not propagated
++    }
++}
++
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming( // streaming prover without progress reporting
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations) {
++    return chiavdf_prove_one_weso_fast_streaming_with_progress( // the callee owns validation and exception handling
++        challenge_hash,
++        challenge_size,
++        x_s,
++        x_s_size,
++        y_ref_s,
++        y_ref_s_size,
++        discriminant_size_bits,
++        num_iterations,
++        /*progress_interval=*/0,
++        /*progress_cb=*/nullptr,
++        /*progress_user_data=*/nullptr);
++}
++
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress( // streaming prover using the direct get_block() mapping
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data) {
++    try { // the impl may throw; the catch below converts that into an empty result
++        return chiavdf_prove_one_weso_fast_streaming_impl(
++            challenge_hash,
++            challenge_size,
++            x_s,
++            x_s_size,
++            y_ref_s,
++            y_ref_s_size,
++            discriminant_size_bits,
++            num_iterations,
++            progress_interval,
++            progress_cb,
++            progress_user_data,
++            /*use_getblock_opt=*/false);
++    } catch (...) {
++        return empty_result(); // failure is reported as {nullptr, 0}
++    }
++}
++
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt( // GetBlock-optimised streaming prover without progress reporting
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations) {
++    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( // the callee owns validation and exception handling
++        challenge_hash,
++        challenge_size,
++        x_s,
++        x_s_size,
++        y_ref_s,
++        y_ref_s_size,
++        discriminant_size_bits,
++        num_iterations,
++        /*progress_interval=*/0,
++        /*progress_cb=*/nullptr,
++        /*progress_user_data=*/nullptr);
++}
++
++extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress( // streaming prover using the incremental get_block_opt() mapping
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data) {
++    try { // the impl may throw; the catch below converts that into an empty result
++        return chiavdf_prove_one_weso_fast_streaming_impl(
++            challenge_hash,
++            challenge_size,
++            x_s,
++            x_s_size,
++            y_ref_s,
++            y_ref_s_size,
++            discriminant_size_bits,
++            num_iterations,
++            progress_interval,
++            progress_cb,
++            progress_user_data,
++            /*use_getblock_opt=*/true);
++    } catch (...) {
++        return empty_result(); // failure is reported as {nullptr, 0}
++    }
++}
++
++extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) { // stores the budget later read by the streaming prover's (k, l) tuning step
++    bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed); // relaxed store: no ordering with other memory is implied
++}
++
++extern "C" void chiavdf_set_enable_streaming_stats(bool enable) { // toggles stats collection for streaming proofs started after this call
++    streaming_stats_enabled.store(enable, std::memory_order_relaxed); // sampled by each callback when it is constructed
++    last_streaming_stats = LastStreamingStats{}; // toggling also discards previously recorded stats
++}
++
++extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) { // copies the most recent (k, l, tuned) triple to the out-pointers
++    if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) {
++        return false; // all three out-pointers are required
++    }
++    if (!last_streaming_parameters.set) {
++        return false; // no streaming proof has recorded parameters yet
++    }
++    *out_k = last_streaming_parameters.k;
++    *out_l = last_streaming_parameters.l;
++    *out_tuned = last_streaming_parameters.tuned;
++    return true;
++}
++
++extern "C" bool chiavdf_get_last_streaming_stats( // copies the most recent streaming counters to the out-pointers
++    uint64_t* out_checkpoint_total_ns,
++    uint64_t* out_checkpoint_event_total_ns,
++    uint64_t* out_finalize_total_ns,
++    uint64_t* out_checkpoint_calls,
++    uint64_t* out_bucket_updates) {
++    if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr ||
++        out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr ||
++        out_bucket_updates == nullptr) {
++        return false; // all five out-pointers are required
++    }
++    if (!last_streaming_stats.set) {
++        return false; // nothing recorded: stats disabled, reset, or no proof reached the stats hand-off
++    }
++    *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns;
++    *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns;
++    *out_finalize_total_ns = last_streaming_stats.finalize_total_ns;
++    *out_checkpoint_calls = last_streaming_stats.checkpoint_calls;
++    *out_bucket_updates = last_streaming_stats.bucket_updates;
++    return true;
++}
++
++extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; } // pairs with the new[] in the prove functions; a null data pointer is a no-op
+diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
+new file mode 100644
+index 0000000..bf33f32
+--- /dev/null
++++ b/src/c_bindings/fast_wrapper.h
+@@ -0,0 +1,145 @@
++#pragma once
++
++#include <stdbool.h>
++#include <stddef.h>
++#include <stdint.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++typedef struct {
++    uint8_t* data;
++    size_t length;
++} ChiavdfByteArray;
++
++typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
++
++// Configure the per-process memory budget used by the parameter tuner when
++// selecting `(k,l)` for streaming/bucket-based proving.
++//
++// The budget is per worker process (not global across multiple processes).
++//
++// If `bytes` is 0, the default chiavdf heuristic is used.
++void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes);
++
++// Debug helper: returns the `(k,l)` parameters selected for the most recent
++// streaming proof computed on the current thread.
++//
++// Returns true if parameters are available.
++bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned);
++
++// Enable lightweight timing counters for the streaming prover.
++//
++// When enabled, the native library records basic timing counters for the most
++// recent streaming proof computed on the current thread. This is intended for
++// benchmarking and tuning; production runs should keep this disabled to avoid
++// extra overhead.
++void chiavdf_set_enable_streaming_stats(bool enable);
++
++// Debug helper: returns timing counters for the most recent streaming proof on
++// the current thread.
++//
++// Returns true if stats are available (i.e. stats enabled and a streaming proof
++// was computed successfully).
++bool chiavdf_get_last_streaming_stats(
++    uint64_t* out_checkpoint_total_ns,
++    uint64_t* out_checkpoint_event_total_ns,
++    uint64_t* out_finalize_total_ns,
++    uint64_t* out_checkpoint_calls,
++    uint64_t* out_bucket_updates);
++
++// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
++//
++// On success, returns `y || proof` where:
++// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
++// - `proof` is the serialized witness form (same size as `y`)
++//
++// On failure, returns `{NULL, 0}`.
++ChiavdfByteArray chiavdf_prove_one_weso_fast(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations);
++
++// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
++// the proving thread every `progress_interval` iterations completed.
++//
++// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data);
++
++// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
++// bucket-accumulation algorithm (Trick 1), which requires the expected output
++// `y_ref` up front (as used by bluebox compaction jobs).
++//
++// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`).
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations);
++
++// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
++// `progress_cb` from the proving thread every `progress_interval` iterations.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data);
++
++// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
++// implementation of the `GetBlock()` mapping (avoids per-block modular
++// exponentiation without allocating a full `GetBlock` table).
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations);
++
++// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
++// invokes `progress_cb` from the proving thread every `progress_interval`
++// iterations.
++ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
++    const uint8_t* challenge_hash,
++    size_t challenge_size,
++    const uint8_t* x_s,
++    size_t x_s_size,
++    const uint8_t* y_ref_s,
++    size_t y_ref_s_size,
++    size_t discriminant_size_bits,
++    uint64_t num_iterations,
++    uint64_t progress_interval,
++    ChiavdfProgressCallback progress_cb,
++    void* progress_user_data);
++
++void chiavdf_free_byte_array(ChiavdfByteArray array);
++
++#ifdef __cplusplus
++}
++#endif
+diff --git a/src/threading.h b/src/threading.h
+index 50d4b49..f6344ad 100644
+--- a/src/threading.h
++++ b/src/threading.h
+@@ -564,8 +564,8 @@ struct alignas(64) thread_counter {
+     }
+ };
+ 
+-thread_counter master_counter[100];
+-thread_counter slave_counter[100];
++thread_counter master_counter[512];
++thread_counter slave_counter[512];
+ 
+ struct thread_state {
+     int pairindex;
+diff --git a/src/vdf.h b/src/vdf.h
+index 9ab4aef..4544fe2 100644
+--- a/src/vdf.h
++++ b/src/vdf.h
+@@ -78,6 +78,18 @@ std::mutex new_event_mutex, cout_lock;
+ bool debug_mode = false;
+ bool fast_algorithm = false;
+ bool two_weso = false;
++bool quiet_mode = false;
++
++// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
++// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
++// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
++// run concurrently in the same process; they must not share a pairindex.
++inline int vdf_fast_pairindex() { // returns the calling thread's slot index into master_counter / slave_counter
++    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0])); // number of counter slots
++    static std::atomic<unsigned> next_slot{0}; // unsigned: a signed counter goes negative after 2^31 threads, and a negative value % kSlots is a negative (out-of-bounds) index
++    thread_local int slot = int(next_slot.fetch_add(1, std::memory_order_relaxed) % unsigned(kSlots)); // assigned once per thread, round-robin; slots repeat once more than kSlots threads have asked
++    return slot;
++}
+ 
+ //always works
+ void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, const integer& L, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
+@@ -137,7 +149,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
+ 
+         // This works single threaded
+         square_state_type square_state;
+-        square_state.pairindex=0;
++        square_state.pairindex=vdf_fast_pairindex();
+ 
+         uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
+ 
+@@ -236,10 +248,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
+             }
+         #endif
+     }
+-    {
+-        // this shouldn't be needed but avoids some false positive in TSAN
+-        std::lock_guard<std::mutex> lk(cout_lock);
+-        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
++    if (!quiet_mode) {
++        {
++            // this shouldn't be needed but avoids some false positive in TSAN
++            std::lock_guard<std::mutex> lk(cout_lock);
++            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
++        }
+     }
+ 
+     #ifdef VDF_TEST
+@@ -275,11 +289,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
+     proof_serialized = SerializeForm(proof_form, d_bits);
+     Proof proof(y_serialized, proof_serialized);
+     proof.witness_type = 0;
+-    {
+-        // this shouldn't be needed but avoids some false positive in TSAN
+-        std::lock_guard<std::mutex> lk(cout_lock);
+-        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
+-    }
+     return proof;
+ }
+ 
+diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md
+new file mode 100644
+index 0000000..61cd1fd
+--- /dev/null
++++ b/docs/bluebox_compaction.md
+@@ -0,0 +1,49 @@
++# Bluebox Compaction Optimizations
++
++This document describes the compaction-oriented proving path exposed by
++`src/c_bindings/fast_wrapper.h` and implemented in
++`src/c_bindings/fast_wrapper.cpp`.
++
++## Scope
++
++These APIs are intended for workloads where the expected VDF output (`y_ref`) is
++already known up front (for example, bluebox compaction jobs). They are additive
++and do not change the existing `c_wrapper` APIs.
++
++## Optimization 1: Streaming one-wesolowski
++
++Given `y_ref`, the prover computes:
++
++- `B = GetB(D, x, y_ref)` before squaring starts
++
++This enables a streaming algorithm that updates proof buckets at each
++checkpoint during repeated squaring, instead of materializing the full
++intermediate checkpoint array and scanning it after the loop. In practice this
++substantially reduces memory usage for compaction workloads.
++
++## Optimization 2: Incremental GetBlock mapping
++
++For streaming checkpoint updates, bucket index selection repeatedly calls
++`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and
++advances sequential `p` values incrementally, avoiding full modular
++exponentiation per call and avoiding a large lookup table.
++
++## Optimization 3: Memory-budgeted (k, l) tuning
++
++The wrapper can tune `(k, l)` under a configured memory budget:
++
++- `chiavdf_set_bucket_memory_budget_bytes(...)`
++
++If no tuned candidate is found, the code falls back to the standard parameter
++heuristics.
++
++## Operational Notes
++
++- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to
++  avoid unsolicited stdout noise when embedded in multi-worker clients.
++- Thread-slot assignment for the fast VDF counters is per-thread via
++  `vdf_fast_pairindex()` (round-robin over the 512 counter slots), avoiding slot
++  collisions until more than 512 threads have requested a slot in one process.
++- The production default for `enable_threads` in `parameters.h` is unchanged from
++  upstream to preserve timelord expectations.
++
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index 61d24599..d660ee80 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -1,7 +1,9 @@
 #include "fast_wrapper.h"
 
 #include <atomic>
+#include <cassert>
 #include <chrono>
+#include <cstdio>
 #include <limits>
 #include <mutex>
 #include <vector>
@@ -63,6 +65,39 @@ void init_chiavdf_fast() {
 
 ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
 
+uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) { // lhs + rhs, clamped to UINT64_MAX instead of wrapping
+    if (lhs > std::numeric_limits<uint64_t>::max() - rhs) { // true exactly when lhs + rhs would overflow
+        return std::numeric_limits<uint64_t>::max();
+    }
+    return lhs + rhs;
+}
+
+void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) { // frees each element's buffer and then the array itself
+    if (arrays == nullptr) {
+        return; // nothing to free
+    }
+    for (size_t idx = 0; idx < count; ++idx) {
+        delete[] arrays[idx].data; // null data (an empty result) is a harmless no-op
+        arrays[idx].data = nullptr;
+        arrays[idx].length = 0;
+    }
+    delete[] arrays; // `arrays` must come from new ChiavdfByteArray[...]
+}
+
+struct BatchProgressContext { // state handed to batch_progress_trampoline through its user_data pointer
+    uint64_t completed_before = 0; // offset added to every iters_done value before it is forwarded
+    ChiavdfProgressCallback progress_cb = nullptr; // callback to forward to; nullptr suppresses reporting
+    void* progress_user_data = nullptr; // passed through to progress_cb unchanged
+};
+
+void batch_progress_trampoline(uint64_t iters_done, void* user_data) { // forwards progress to the wrapped callback after adding the context's offset
+    auto* ctx = static_cast<BatchProgressContext*>(user_data); // user_data must point at a BatchProgressContext (or be null)
+    if (ctx == nullptr || ctx->progress_cb == nullptr) {
+        return; // no context or no callback: drop the event
+    }
+    ctx->progress_cb(saturating_add_u64(ctx->completed_before, iters_done), ctx->progress_user_data); // saturating sum cannot wrap around
+}
+
 uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
     // Be conservative: class group forms contain 3 GMP-backed integers that
     // quickly grow to the discriminant size (or beyond) during NUCOMP.
@@ -96,6 +131,13 @@ bool tune_streaming_parameters(
 
     unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
     bool found = false;
+#ifndef NDEBUG
+    uint32_t best_k = 0;
+    uint32_t best_l = 0;
+    unsigned __int128 best_updates = 0;
+    unsigned __int128 best_checkpoints = 0;
+    unsigned __int128 best_fold = 0;
+#endif
 
     // Empirical tuning notes (1024-bit discriminants, AVX2 build):
     // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
@@ -133,10 +175,42 @@ bool tune_streaming_parameters(
                 best_cost = cost;
                 out_k = k;
                 out_l = l;
+#ifndef NDEBUG
+                best_k = k;
+                best_l = l;
+                best_updates = updates;
+                best_checkpoints = checkpoints;
+                best_fold = fold;
+#endif
             }
         }
     }
 
+#ifndef NDEBUG
+    if (found) {
+        assert(best_k >= 4 && best_k <= 20);
+        assert(best_l >= 1 && best_l <= 64);
+        std::fprintf(
+            stderr,
+            "[chiavdf] tune_streaming_parameters: T=%llu, budget=%llu, selected=(k=%u,l=%u), "
+            "components{updates=%llu, checkpoints=%llu, fold=%llu}, weights{u=16,c=1,f=16}\n",
+            static_cast<unsigned long long>(num_iterations),
+            static_cast<unsigned long long>(memory_budget_bytes),
+            best_k,
+            best_l,
+            static_cast<unsigned long long>(best_updates),
+            static_cast<unsigned long long>(best_checkpoints),
+            static_cast<unsigned long long>(best_fold));
+        if (best_k == 20 && num_iterations < (1ULL << 24)) {
+            std::fprintf(
+                stderr,
+                "[chiavdf] tune_streaming_parameters: high-k selection for moderate T "
+                "(k=20, T=%llu); verify measured update/fold timing assumptions.\n",
+                static_cast<unsigned long long>(num_iterations));
+        }
+    }
+#endif
+
     return found;
 }
 
@@ -331,6 +405,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         }
     }
 
+  public:
     bool init_ok() const { return getblock_ok; }
 
     bool ok() const { return has_result; }
@@ -401,6 +476,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         return out;
     }
 
+  private:
     form& bucket(uint32_t j, uint64_t b) {
         size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
         return buckets[idx];
@@ -836,4 +912,96 @@ extern "C" bool chiavdf_get_last_streaming_stats(
     return true;
 }
 
+extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data) {
+    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
+        return nullptr;
+    }
+    if (discriminant_size_bits == 0 || jobs == nullptr || job_count == 0) {
+        return nullptr;
+    }
+
+    ChiavdfByteArray* out_arrays = nullptr;
+    try {
+        out_arrays = new ChiavdfByteArray[job_count];
+        for (size_t idx = 0; idx < job_count; ++idx) {
+            out_arrays[idx] = empty_result();
+        }
+
+        uint64_t completed_iters = 0;
+        for (size_t idx = 0; idx < job_count; ++idx) {
+            const ChiavdfBatchJob& job = jobs[idx];
+            if (job.y_ref_s == nullptr || job.y_ref_s_size == 0 || job.num_iterations == 0) {
+                free_byte_array_batch_internal(out_arrays, job_count);
+                return nullptr;
+            }
+
+            BatchProgressContext progress_ctx;
+            progress_ctx.completed_before = completed_iters;
+            progress_ctx.progress_cb = progress_cb;
+            progress_ctx.progress_user_data = progress_user_data;
+            const bool use_progress = progress_cb != nullptr && progress_interval != 0;
+
+            out_arrays[idx] = chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
+                challenge_hash,
+                challenge_size,
+                x_s,
+                x_s_size,
+                job.y_ref_s,
+                job.y_ref_s_size,
+                discriminant_size_bits,
+                job.num_iterations,
+                progress_interval,
+                use_progress ? batch_progress_trampoline : nullptr,
+                use_progress ? static_cast<void*>(&progress_ctx) : nullptr);
+
+            if (out_arrays[idx].data == nullptr || out_arrays[idx].length == 0) {
+                free_byte_array_batch_internal(out_arrays, job_count);
+                return nullptr;
+            }
+
+            completed_iters = saturating_add_u64(completed_iters, job.num_iterations);
+        }
+
+        return out_arrays;
+    } catch (...) {
+        free_byte_array_batch_internal(out_arrays, job_count);
+        return nullptr;
+    }
+}
+
+extern "C" ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count) {
+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+        challenge_hash,
+        challenge_size,
+        x_s,
+        x_s_size,
+        discriminant_size_bits,
+        jobs,
+        job_count,
+        /*progress_interval=*/0,
+        /*progress_cb=*/nullptr,
+        /*progress_user_data=*/nullptr);
+}
+
+extern "C" void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count) {
+    free_byte_array_batch_internal(arrays, count);
+}
+
 extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }
diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
index bf33f320..115c3abd 100644
--- a/src/c_bindings/fast_wrapper.h
+++ b/src/c_bindings/fast_wrapper.h
@@ -13,6 +13,12 @@ typedef struct {
     size_t length;
 } ChiavdfByteArray;
 
+typedef struct {
+    const uint8_t* y_ref_s;
+    size_t y_ref_s_size;
+    uint64_t num_iterations;
+} ChiavdfBatchJob;
+
 typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
 
 // Configure the per-process memory budget used by the parameter tuner when
@@ -138,6 +144,32 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progres
     ChiavdfProgressCallback progress_cb,
     void* progress_user_data);
 
+// Batch variant: one proof per `jobs[i]`; all jobs share challenge, x_s and discriminant size.
+// Returns `job_count` results, or NULL on any failure; free with chiavdf_free_byte_array_batch().
+ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count);
+
+// Same as the batch API above; reports aggregate progress only when progress_cb is non-NULL and progress_interval != 0.
+ChiavdfByteArray* chiavdf_prove_one_weso_fast_streaming_getblock_opt_batch_with_progress(
+    const uint8_t* challenge_hash,
+    size_t challenge_size,
+    const uint8_t* x_s,
+    size_t x_s_size,
+    size_t discriminant_size_bits,
+    const ChiavdfBatchJob* jobs,
+    size_t job_count,
+    uint64_t progress_interval,
+    ChiavdfProgressCallback progress_cb,
+    void* progress_user_data);
+
+void chiavdf_free_byte_array_batch(ChiavdfByteArray* arrays, size_t count);
+
 void chiavdf_free_byte_array(ChiavdfByteArray array);
 
 #ifdef __cplusplus

From 95f8ff18d3adcfd291767fbc35273a202d6c645d Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:51:52 -0800
Subject: [PATCH 07/13] Make fast-thread counter slots build-configurable.

Default CHIA_VDF_FAST_COUNTER_SLOTS to 100 in threading.h so upstream builds keep lower BSS usage while allowing embedded deployments to override via compiler defines.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/threading.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/threading.h b/src/threading.h
index 8354d824..dbb18592 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -566,8 +566,12 @@ struct alignas(64) thread_counter {
     }
 };
 
-thread_counter master_counter[512];
-thread_counter slave_counter[512];
+#ifndef CHIA_VDF_FAST_COUNTER_SLOTS
+#define CHIA_VDF_FAST_COUNTER_SLOTS 100
+#endif
+
+thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
+thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 
 struct thread_state {
     int pairindex;

From 746ba2e8edadadb0e81951c4bf93aaf8a5a3dbe0 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:55:50 -0800
Subject: [PATCH 08/13] Fix fast pairindex allocator state across translation
 units.

Use one program-wide atomic slot allocator for `vdf_fast_pairindex()` so concurrent VDF computations started from different translation units cannot collide on shared fast counter slots.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/vdf.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/vdf.h b/src/vdf.h
index 8ca75d8c..575cc78a 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -93,11 +93,16 @@ bool quiet_mode = false;
 // The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
 // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
 // run concurrently in the same process; they must not share a pairindex.
+#if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
+// Keep slot allocation state as one program-wide entity for all TUs that include
+// this header, so concurrent callers cannot recycle the same slot sequence.
+inline std::atomic<unsigned int> vdf_fast_next_slot{0};
+#endif
+
 inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
-    static std::atomic<unsigned int> next_slot{0};
-    thread_local int slot = int(next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
+    thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
     return slot;
 #else
     return 0;

From 707b2f46e3499b2ba65547693401ceb7a4f069a3 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 24 Feb 2026 00:58:53 -0800
Subject: [PATCH 09/13] Guard streaming prover bucket shifts against invalid k.

Reject k>=64 before any 64-bit left-shift and reuse validated bucket spans for allocation, indexing, and finalization loops so invalid parameter tuning cannot trigger undefined behavior.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 53 ++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index d660ee80..af3bf805 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -72,6 +72,14 @@ uint64_t saturating_add_u64(uint64_t lhs, uint64_t rhs) {
     return lhs + rhs;
 }
 
+bool try_pow2_u64_shift(uint32_t shift, uint64_t& out) {
+    if (shift >= 64) {
+        return false;
+    }
+    out = 1ULL << shift;
+    return true;
+}
+
 void free_byte_array_batch_internal(ChiavdfByteArray* arrays, size_t count) {
     if (arrays == nullptr) {
         return;
@@ -289,7 +297,19 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
           use_getblock_opt(use_getblock_opt),
           stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
         form id = form::identity(D);
-        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);
+        uint64_t bucket_span_u64 = 0;
+        if (!try_pow2_u64_shift(k, bucket_span_u64)) {
+            getblock_ok = false;
+            return;
+        }
+
+        bucket_span = static_cast<size_t>(bucket_span_u64);
+        if (bucket_span != 0 && static_cast<size_t>(l) > std::numeric_limits<size_t>::max() / bucket_span) {
+            getblock_ok = false;
+            return;
+        }
+
+        buckets.resize(static_cast<size_t>(l) * bucket_span, id);
 
         if (use_getblock_opt) {
             getblock_ok = init_getblock_opt_state();
@@ -423,29 +443,35 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
         uint64_t k1 = k / 2;
         uint64_t k0 = k - k1;
+        uint64_t span_k0 = 0;
+        uint64_t span_k1 = 0;
+        if (!try_pow2_u64_shift(static_cast<uint32_t>(k0), span_k0) ||
+            !try_pow2_u64_shift(static_cast<uint32_t>(k1), span_k1)) {
+            return form::identity(D);
+        }
         form x = id;
 
         for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {
-            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
+            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(bucket_span)), L, reducer);
 
-            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
+            for (uint64_t b1 = 0; b1 < span_k1; b1++) {
                 form z = id;
-                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
-                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                for (uint64_t b0 = 0; b0 < span_k0; b0++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * span_k0 + b0), D, L);
                 }
                 z = FastPowFormNucomp(
                     z,
                     D,
-                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
+                    integer(static_cast<uint64_t>(b1 * span_k0)),
                     L,
                     reducer);
                 nucomp_form(x, x, z, D, L);
             }
 
-            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
+            for (uint64_t b0 = 0; b0 < span_k0; b0++) {
                 form z = id;
-                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
-                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
+                for (uint64_t b1 = 0; b1 < span_k1; b1++) {
+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * span_k0 + b0), D, L);
                 }
                 z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
                 nucomp_form(x, x, z, D, L);
@@ -478,12 +504,12 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
 
   private:
     form& bucket(uint32_t j, uint64_t b) {
-        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        size_t idx = static_cast<size_t>(j) * bucket_span + static_cast<size_t>(b);
         return buckets[idx];
     }
 
     const form& bucket(uint32_t j, uint64_t b) const {
-        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
+        size_t idx = static_cast<size_t>(j) * bucket_span + static_cast<size_t>(b);
         return buckets[idx];
     }
 
@@ -497,6 +523,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     ChiavdfProgressCallback progress_cb;
     void* progress_user_data;
     uint64_t next_progress;
+    size_t bucket_span = 0;
 
     std::vector<form> buckets;
     form result;
@@ -620,6 +647,10 @@ ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
     if (l == 0) {
         l = 1;
     }
+    uint64_t ignored_bucket_span = 0;
+    if (!try_pow2_u64_shift(k, ignored_bucket_span)) {
+        return empty_result();
+    }
 
     last_streaming_parameters.k = k;
     last_streaming_parameters.l = l;

From 0c11002ba3aabb9cdb2f65bf31d9c8cd5393fb73 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 19:17:42 -0700
Subject: [PATCH 10/13] Harden fast counter slot safety and macOS cmake setup.

Add compile-time guards that reject zero fast-counter slot configurations before modulo indexing, and export Homebrew's cmake path in macOS workflows so cmake is available within the same step on Intel runners.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/build-c-libraries.yml | 1 +
 .github/workflows/build.yml             | 1 +
 src/threading.h                         | 2 ++
 src/vdf.h                               | 1 +
 4 files changed, 5 insertions(+)

diff --git a/.github/workflows/build-c-libraries.yml b/.github/workflows/build-c-libraries.yml
index db833104..451fabee 100644
--- a/.github/workflows/build-c-libraries.yml
+++ b/.github/workflows/build-c-libraries.yml
@@ -90,6 +90,7 @@ jobs:
         CMAKE_BIN="$(brew --prefix cmake)/bin"
         if [ -d "$CMAKE_BIN" ]; then
           echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+          export PATH="$CMAKE_BIN:$PATH"
         fi
         cmake --version
 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4ad967ec..798241d7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -110,6 +110,7 @@ jobs:
         CMAKE_BIN="$(brew --prefix cmake)/bin"
         if [ -d "$CMAKE_BIN" ]; then
           echo "$CMAKE_BIN" >> "$GITHUB_PATH"
+          export PATH="$CMAKE_BIN:$PATH"
         fi
         cmake --version
 
diff --git a/src/threading.h b/src/threading.h
index dbb18592..3574a98f 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -570,6 +570,8 @@ struct alignas(64) thread_counter {
 #define CHIA_VDF_FAST_COUNTER_SLOTS 100
 #endif
 
+static_assert(CHIA_VDF_FAST_COUNTER_SLOTS > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
+
 thread_counter master_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 thread_counter slave_counter[CHIA_VDF_FAST_COUNTER_SLOTS];
 
diff --git a/src/vdf.h b/src/vdf.h
index 575cc78a..eb1d0d39 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -102,6 +102,7 @@ inline std::atomic<unsigned int> vdf_fast_next_slot{0};
 inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
+    static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
     thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
     return slot;
 #else

From 1e7342199beb8e0d199f2201e6040ef5b4f6ca94 Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 19:37:23 -0700
Subject: [PATCH 11/13] Remove stale patch artifact and refine tuner update
 cost.

Drop the root-level development patch file that diverged from the live implementation, and adjust the streaming tuner cost model so bucket-update work scales with checkpoint count and `l` instead of only `k`.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 pr1_upstream_ready.patch        | 1158 -------------------------------
 src/c_bindings/fast_wrapper.cpp |    5 +-
 2 files changed, 3 insertions(+), 1160 deletions(-)
 delete mode 100644 pr1_upstream_ready.patch

diff --git a/pr1_upstream_ready.patch b/pr1_upstream_ready.patch
deleted file mode 100644
index b14a93bb..00000000
--- a/pr1_upstream_ready.patch
+++ /dev/null
@@ -1,1158 +0,0 @@
-diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
-index ed41963..ca55a95 100644
---- a/src/Makefile.vdf-client
-+++ b/src/Makefile.vdf-client
-@@ -6,9 +6,24 @@ else
- NOPIE = -no-pie
- endif
- 
--LDFLAGS += -flto $(NOPIE) -g
-+# Optional: override `LTO=` to disable link-time optimization.
-+LTO ?= -flto
-+
-+# Optional: set `PIC=1` to build position-independent objects (recommended when
-+# linking chiavdf code into other PIE/shared-library binaries).
-+PIC ?= 0
-+ifeq ($(PIC),1)
-+PICFLAGS = -fPIC
-+PIEFLAGS =
-+else
-+PICFLAGS =
-+PIEFLAGS = $(NOPIE)
-+endif
-+
-+LDFLAGS += $(LTO) $(PIEFLAGS) -g
- LDLIBS += -lgmpxx -lgmp -pthread
--CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
-+CXXFLAGS += $(LTO) -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(PIEFLAGS) $(PICFLAGS) -fvisibility=hidden
-+ASFLAGS += $(PICFLAGS)
- ifeq ($(UNAME),Darwin)
- CXXFLAGS += -D CHIAOSX=1
- endif
-@@ -31,7 +46,7 @@ BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
- all: $(BINS)
- 
- clean:
--	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
-+	rm -f *.o hw/*.o c_bindings/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client libchiavdf_fastc.a
- 
- $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
- 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
-@@ -39,7 +54,10 @@ $(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_as
- $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
- 
- lzcnt.o: refcode/lzcnt.c
--	$(CC) -c refcode/lzcnt.c
-+	$(CC) -c refcode/lzcnt.c $(OPT_CFLAGS) $(PICFLAGS)
-+
-+%.o: %.s
-+	$(CC) -c $< -o $@ $(ASFLAGS)
- 
- asm_compiled.s: compile_asm
- 	./compile_asm
-@@ -53,6 +71,22 @@ avx512_asm_compiled.s: compile_asm
- compile_asm: compile_asm.o
- 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
- 
-+# ---------------------------------------------------------------------------
-+# Static library: fast one-wesolowski proof (BBR integration)
-+# ---------------------------------------------------------------------------
-+
-+FASTLIB = libchiavdf_fastc.a
-+FASTLIB_OBJS = c_bindings/fast_wrapper.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
-+
-+.PHONY: fastlib
-+
-+fastlib: $(FASTLIB)
-+
-+$(FASTLIB): $(FASTLIB_OBJS)
-+	$(AR) rcs $@ $^
-+
-+c_bindings/fast_wrapper.o: CXXFLAGS += $(OPT_CFLAGS)
-+
- HW_OBJS = $(addprefix hw/,hw_util.o hw_proof.o hw_interface.o chia_driver.o ftdi_driver.o vdf_driver.o pll_freqs.o) vdf_base.o lzcnt.o
- EMU_OBJS = hw/emu_funcs.o hw/emu_runner.o
- HW_LIB = hw/libft4222/build-x86_64/libft4222.so
-diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
-new file mode 100644
-index 0000000..198d0a8
---- /dev/null
-+++ b/src/c_bindings/fast_wrapper.cpp
-@@ -0,0 +1,795 @@
-+#include "fast_wrapper.h"
-+
-+#include <atomic>
-+#include <chrono>
-+#include <limits>
-+#include <mutex>
-+#include <vector>
-+
-+#include "../vdf.h"
-+#include "../create_discriminant.h"
-+
-+// Runtime configuration knobs required by `parameters.h`.
-+// These are `extern` variables there, but each binary defines them explicitly.
-+bool use_divide_table = false;
-+int gcd_base_bits = 50;
-+int gcd_128_max_iter = 3;
-+std::string asmprefix = "cel_";
-+bool enable_all_instructions = false;
-+
-+namespace {
-+std::once_flag init_once;
-+std::atomic<uint64_t> bucket_memory_budget_bytes(128ULL * 1024ULL * 1024ULL);
-+std::atomic<bool> streaming_stats_enabled(false);
-+
-+struct LastStreamingParameters {
-+    uint32_t k = 0;
-+    uint32_t l = 0;
-+    bool tuned = false;
-+    bool set = false;
-+};
-+
-+thread_local LastStreamingParameters last_streaming_parameters;
-+
-+struct LastStreamingStats {
-+    uint64_t checkpoint_total_ns = 0;
-+    uint64_t checkpoint_event_total_ns = 0;
-+    uint64_t finalize_total_ns = 0;
-+    uint64_t checkpoint_calls = 0;
-+    uint64_t bucket_updates = 0;
-+    bool set = false;
-+};
-+
-+thread_local LastStreamingStats last_streaming_stats;
-+
-+void init_chiavdf_fast() {
-+    init_gmp();
-+    set_rounding_mode();
-+
-+    // Match the vdf_client runtime selection for AVX2.
-+    if (hasAVX2()) {
-+        gcd_base_bits = 63;
-+        gcd_128_max_iter = 2;
-+    } else {
-+        gcd_base_bits = 50;
-+        gcd_128_max_iter = 3;
-+    }
-+
-+    // Ensure we run the one-wesolowski path by default.
-+    fast_algorithm = false;
-+    two_weso = false;
-+    quiet_mode = true;
-+}
-+
-+ChiavdfByteArray empty_result() { return ChiavdfByteArray{nullptr, 0}; }
-+
-+uint64_t estimate_bucket_form_bytes(size_t discriminant_size_bits) {
-+    // Be conservative: class group forms contain 3 GMP-backed integers that
-+    // quickly grow to the discriminant size (or beyond) during NUCOMP.
-+    //
-+    // This estimate is intentionally larger than the raw serialized size to
-+    // avoid picking parameters that risk paging/OOM.
-+    uint64_t discr_bytes = (static_cast<uint64_t>(discriminant_size_bits) + 7) / 8;
-+    uint64_t estimate = discr_bytes * 16;
-+    if (estimate < 2048) {
-+        estimate = 2048;
-+    }
-+    return estimate;
-+}
-+
-+bool tune_streaming_parameters(
-+    uint64_t num_iterations,
-+    size_t discriminant_size_bits,
-+    uint64_t memory_budget_bytes,
-+    uint32_t& out_l,
-+    uint32_t& out_k) {
-+    if (memory_budget_bytes == 0) {
-+        return false;
-+    }
-+
-+    // Keep headroom for GMP scratch allocations and general process overhead.
-+    uint64_t budget = (memory_budget_bytes * 80) / 100;
-+    uint64_t bytes_per_form = estimate_bucket_form_bytes(discriminant_size_bits);
-+    if (budget < bytes_per_form) {
-+        return false;
-+    }
-+
-+    unsigned __int128 best_cost = std::numeric_limits<unsigned __int128>::max();
-+    bool found = false;
-+
-+    // Empirical tuning notes (1024-bit discriminants, AVX2 build):
-+    // - Each bucket update (NUCOMP) and each fold unit is ~5µs.
-+    // - Per-checkpoint event overhead (SetForm + bookkeeping) is ~0.3µs.
-+    //
-+    // So checkpoint counts should be weighted much lower than updates/fold.
-+    constexpr unsigned __int128 update_weight = 16;
-+    constexpr unsigned __int128 fold_weight = 16;
-+    constexpr unsigned __int128 checkpoint_weight = 1;
-+
-+    // Search a small grid of `(k,l)` values. Higher `k` reduces checkpoint work
-+    // (~T/k) but increases fold work (~l·2^k) and bucket memory (~l·2^k).
-+    for (uint32_t k = 4; k <= 20; k++) {
-+        unsigned __int128 buckets_per_row = static_cast<unsigned __int128>(1) << k;
-+
-+        for (uint32_t l = 1; l <= 64; l++) {
-+            unsigned __int128 form_count = buckets_per_row * static_cast<unsigned __int128>(l);
-+            unsigned __int128 mem_required =
-+                form_count * static_cast<unsigned __int128>(bytes_per_form);
-+            if (mem_required > static_cast<unsigned __int128>(budget)) {
-+                continue;
-+            }
-+
-+            unsigned __int128 updates = static_cast<unsigned __int128>(
-+                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));
-+            uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
-+            unsigned __int128 checkpoints = static_cast<unsigned __int128>(
-+                (num_iterations + kl - 1) / kl);
-+            unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
-+            unsigned __int128 cost =
-+                updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;
-+
-+            if (!found || cost < best_cost) {
-+                found = true;
-+                best_cost = cost;
-+                out_k = k;
-+                out_l = l;
-+            }
-+        }
-+    }
-+
-+    return found;
-+}
-+
-+uint64_t get_block(uint64_t i, uint64_t k, uint64_t T, integer& B) {
-+    integer res = FastPow(2, T - k * (i + 1), B);
-+    mpz_mul_2exp(res.impl, res.impl, k);
-+    res = res / B;
-+    auto res_vector = res.to_vector();
-+    return res_vector.empty() ? 0 : res_vector[0];
-+}
-+
-+class ProgressOneWesolowskiCallback final : public OneWesolowskiCallback {
-+  public:
-+    ProgressOneWesolowskiCallback(
-+        integer& D,
-+        form& f,
-+        uint64_t wanted_iter,
-+        uint64_t progress_interval,
-+        ChiavdfProgressCallback progress_cb,
-+        void* progress_user_data)
-+        : OneWesolowskiCallback(D, f, wanted_iter),
-+          progress_interval(progress_interval),
-+          progress_cb(progress_cb),
-+          progress_user_data(progress_user_data),
-+          next_progress(progress_interval) {}
-+
-+    void OnIteration(int type, void* data, uint64_t iteration) override {
-+        OneWesolowskiCallback::OnIteration(type, data, iteration);
-+
-+        if (progress_cb == nullptr || progress_interval == 0) {
-+            return;
-+        }
-+
-+        uint64_t done = iteration + 1;
-+        if (done > wanted_iter) {
-+            return;
-+        }
-+
-+        if (done >= next_progress) {
-+            progress_cb(next_progress, progress_user_data);
-+            next_progress += progress_interval;
-+        }
-+    }
-+
-+  private:
-+    uint64_t progress_interval;
-+    ChiavdfProgressCallback progress_cb;
-+    void* progress_user_data;
-+    uint64_t next_progress;
-+};
-+
-+class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
-+  public:
-+    StreamingOneWesolowskiCallback(
-+        integer& D,
-+        uint64_t wanted_iter,
-+        uint32_t k,
-+        uint32_t l,
-+        uint64_t limit,
-+        integer& B,
-+        bool use_getblock_opt,
-+        uint64_t progress_interval,
-+        ChiavdfProgressCallback progress_cb,
-+        void* progress_user_data)
-+        : WesolowskiCallback(D),
-+          wanted_iter(wanted_iter),
-+          k(k),
-+          l(l),
-+          kl(static_cast<uint64_t>(k) * static_cast<uint64_t>(l)),
-+          limit(limit),
-+          B(B),
-+          progress_interval(progress_interval),
-+          progress_cb(progress_cb),
-+          progress_user_data(progress_user_data),
-+          next_progress(progress_interval),
-+          use_getblock_opt(use_getblock_opt),
-+          stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
-+        form id = form::identity(D);
-+        buckets.resize(static_cast<size_t>(l) * (1ULL << k), id);
-+
-+        if (use_getblock_opt) {
-+            getblock_ok = init_getblock_opt_state();
-+        }
-+    }
-+
-+    void OnIteration(int type, void* data, uint64_t iteration) override {
-+        iteration++;
-+        if (iteration > wanted_iter) {
-+            return;
-+        }
-+
-+        if (progress_cb != nullptr && progress_interval != 0 && iteration >= next_progress) {
-+            progress_cb(next_progress, progress_user_data);
-+            next_progress += progress_interval;
-+        }
-+
-+        if (iteration % kl == 0) {
-+            uint64_t pos = iteration / kl;
-+            if (pos < limit) {
-+                form checkpoint;
-+                auto started_at = std::chrono::steady_clock::time_point{};
-+                if (stats_enabled) {
-+                    started_at = std::chrono::steady_clock::now();
-+                }
-+                SetForm(type, data, &checkpoint);
-+                process_checkpoint(pos, checkpoint, /*record_stats=*/true);
-+                if (stats_enabled) {
-+                    checkpoint_event_total_ns += static_cast<uint64_t>(
-+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
-+                            std::chrono::steady_clock::now() - started_at)
-+                            .count());
-+                }
-+            }
-+        }
-+
-+        if (iteration == wanted_iter) {
-+            SetForm(type, data, &result);
-+            has_result = true;
-+        }
-+    }
-+
-+    void process_checkpoint(uint64_t i, const form& checkpoint, bool record_stats) {
-+        const bool do_stats = stats_enabled && record_stats;
-+        auto started_at = std::chrono::steady_clock::time_point{};
-+        if (do_stats) {
-+            started_at = std::chrono::steady_clock::now();
-+        }
-+
-+        uint64_t local_updates = 0;
-+        for (uint32_t j = 0; j < l; j++) {
-+            uint64_t p = i * static_cast<uint64_t>(l) + static_cast<uint64_t>(j);
-+            uint64_t needed = static_cast<uint64_t>(k) * (p + 1);
-+            if (wanted_iter < needed) {
-+                break;
-+            }
-+            uint64_t b = use_getblock_opt ? get_block_opt(p) : get_block(p, k, wanted_iter, B);
-+            if (do_stats) {
-+                local_updates++;
-+            }
-+            nucomp_form(bucket(j, b), bucket(j, b), checkpoint, D, L);
-+        }
-+
-+        if (do_stats) {
-+            checkpoint_calls++;
-+            bucket_updates += local_updates;
-+            checkpoint_total_ns += static_cast<uint64_t>(
-+                std::chrono::duration_cast<std::chrono::nanoseconds>(
-+                    std::chrono::steady_clock::now() - started_at)
-+                    .count());
-+        }
-+    }
-+
-+    bool init_ok() const { return getblock_ok; }
-+
-+    bool ok() const { return has_result; }
-+
-+    const form& y() const { return result; }
-+
-+    form finalize_proof() {
-+        auto started_at = std::chrono::steady_clock::time_point{};
-+        if (stats_enabled) {
-+            started_at = std::chrono::steady_clock::now();
-+        }
-+
-+        PulmarkReducer reducer;
-+        form id = form::identity(D);
-+
-+        uint64_t k1 = k / 2;
-+        uint64_t k0 = k - k1;
-+        form x = id;
-+
-+        for (int64_t j = static_cast<int64_t>(l) - 1; j >= 0; j--) {
-+            x = FastPowFormNucomp(x, D, integer(static_cast<uint64_t>(1) << k), L, reducer);
-+
-+            for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
-+                form z = id;
-+                for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
-+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
-+                }
-+                z = FastPowFormNucomp(
-+                    z,
-+                    D,
-+                    integer(static_cast<uint64_t>(b1 * (1ULL << k0))),
-+                    L,
-+                    reducer);
-+                nucomp_form(x, x, z, D, L);
-+            }
-+
-+            for (uint64_t b0 = 0; b0 < (1ULL << k0); b0++) {
-+                form z = id;
-+                for (uint64_t b1 = 0; b1 < (1ULL << k1); b1++) {
-+                    nucomp_form(z, z, bucket(static_cast<uint32_t>(j), b1 * (1ULL << k0) + b0), D, L);
-+                }
-+                z = FastPowFormNucomp(z, D, integer(b0), L, reducer);
-+                nucomp_form(x, x, z, D, L);
-+            }
-+        }
-+
-+        reducer.reduce(x);
-+
-+        if (stats_enabled) {
-+            finalize_total_ns += static_cast<uint64_t>(
-+                std::chrono::duration_cast<std::chrono::nanoseconds>(
-+                    std::chrono::steady_clock::now() - started_at)
-+                    .count());
-+        }
-+        return x;
-+    }
-+
-+    bool stats_ok() const { return stats_enabled; }
-+
-+    LastStreamingStats stats() const {
-+        LastStreamingStats out;
-+        out.checkpoint_total_ns = checkpoint_total_ns;
-+        out.checkpoint_event_total_ns = checkpoint_event_total_ns;
-+        out.finalize_total_ns = finalize_total_ns;
-+        out.checkpoint_calls = checkpoint_calls;
-+        out.bucket_updates = bucket_updates;
-+        out.set = stats_enabled;
-+        return out;
-+    }
-+
-+  private:
-+    form& bucket(uint32_t j, uint64_t b) {
-+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
-+        return buckets[idx];
-+    }
-+
-+    const form& bucket(uint32_t j, uint64_t b) const {
-+        size_t idx = static_cast<size_t>(j) * (1ULL << k) + static_cast<size_t>(b);
-+        return buckets[idx];
-+    }
-+
-+    uint64_t wanted_iter;
-+    uint32_t k;
-+    uint32_t l;
-+    uint64_t kl;
-+    uint64_t limit;
-+    integer B;
-+    uint64_t progress_interval;
-+    ChiavdfProgressCallback progress_cb;
-+    void* progress_user_data;
-+    uint64_t next_progress;
-+
-+    std::vector<form> buckets;
-+    form result;
-+    bool has_result = false;
-+
-+    bool use_getblock_opt;
-+    bool getblock_ok = true;
-+    uint64_t getblock_next_p = 0;
-+    integer getblock_inv_2k;
-+    integer getblock_r;
-+    integer getblock_tmp;
-+
-+    bool stats_enabled;
-+    uint64_t checkpoint_total_ns = 0;
-+    uint64_t checkpoint_event_total_ns = 0;
-+    uint64_t finalize_total_ns = 0;
-+    uint64_t checkpoint_calls = 0;
-+    uint64_t bucket_updates = 0;
-+
-+    bool init_getblock_opt_state() {
-+        if (k == 0) {
-+            return false;
-+        }
-+        uint64_t k_u64 = static_cast<uint64_t>(k);
-+        if (wanted_iter < k_u64) {
-+            return true;
-+        }
-+
-+        integer two_k_mod = FastPow(2, k_u64, B);
-+        if (mpz_invert(getblock_inv_2k.impl, two_k_mod.impl, B.impl) == 0) {
-+            return false;
-+        }
-+
-+        getblock_r = FastPow(2, wanted_iter - k_u64, B);
-+        getblock_next_p = 0;
-+        return true;
-+    }
-+
-+    uint64_t get_block_opt(uint64_t p) {
-+        if (!getblock_ok || wanted_iter < static_cast<uint64_t>(k)) {
-+            return get_block(p, k, wanted_iter, B);
-+        }
-+
-+        // Expected call pattern is sequential `p`. If we ever get out of sync,
-+        // advance state forward or fall back to the slow mapping.
-+        if (p < getblock_next_p) {
-+            return get_block(p, k, wanted_iter, B);
-+        }
-+        while (getblock_next_p < p) {
-+            mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
-+            mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
-+            getblock_next_p++;
-+        }
-+
-+        mpz_mul_2exp(getblock_tmp.impl, getblock_r.impl, k);
-+        mpz_fdiv_q(getblock_tmp.impl, getblock_tmp.impl, B.impl);
-+        uint64_t b = mpz_get_ui(getblock_tmp.impl);
-+
-+        mpz_mul(getblock_r.impl, getblock_r.impl, getblock_inv_2k.impl);
-+        mpz_mod(getblock_r.impl, getblock_r.impl, B.impl);
-+        getblock_next_p++;
-+
-+        return b;
-+    }
-+};
-+
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_impl(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data,
-+    bool use_getblock_opt) {
-+    std::call_once(init_once, init_chiavdf_fast);
-+
-+    last_streaming_stats = LastStreamingStats{};
-+
-+    if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0 ||
-+        y_ref_s == nullptr || y_ref_s_size == 0) {
-+        return empty_result();
-+    }
-+    if (num_iterations == 0) {
-+        return empty_result();
-+    }
-+
-+    std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
-+    integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
-+    integer L = root(-D, 4);
-+
-+    form x = DeserializeForm(D, x_s, x_s_size);
-+    form y_ref = DeserializeForm(D, y_ref_s, y_ref_s_size);
-+
-+    uint32_t k;
-+    uint32_t l;
-+    bool tuned = false;
-+    const uint64_t budget =
-+        bucket_memory_budget_bytes.load(std::memory_order_relaxed);
-+    if (num_iterations >= (1 << 16)) {
-+        tuned = tune_streaming_parameters(num_iterations, discriminant_size_bits, budget, l, k);
-+    }
-+    if (!tuned) {
-+        if (num_iterations >= (1 << 16)) {
-+            ApproximateParameters(num_iterations, l, k);
-+        } else {
-+            k = 10;
-+            l = 1;
-+        }
-+    }
-+    if (k == 0) {
-+        k = 1;
-+    }
-+    if (l == 0) {
-+        l = 1;
-+    }
-+
-+    last_streaming_parameters.k = k;
-+    last_streaming_parameters.l = l;
-+    last_streaming_parameters.tuned = tuned;
-+    last_streaming_parameters.set = true;
-+
-+    uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
-+    uint64_t limit = num_iterations / kl;
-+    if (num_iterations % kl) {
-+        limit++;
-+    }
-+
-+    integer B = GetB(D, x, y_ref);
-+
-+    std::atomic<bool> stopped(false);
-+    StreamingOneWesolowskiCallback weso(
-+        D,
-+        num_iterations,
-+        k,
-+        l,
-+        limit,
-+        B,
-+        use_getblock_opt,
-+        progress_interval,
-+        progress_cb,
-+        progress_user_data);
-+
-+    if (!weso.init_ok()) {
-+        return empty_result();
-+    }
-+
-+    weso.process_checkpoint(/*i=*/0, x, /*record_stats=*/false);
-+
-+    FastStorage* fast_storage = nullptr;
-+    repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
-+
-+    if (!weso.ok()) {
-+        return empty_result();
-+    }
-+    if (!(weso.y() == y_ref)) {
-+        return empty_result();
-+    }
-+
-+    form proof_form = weso.finalize_proof();
-+
-+    if (weso.stats_ok()) {
-+        last_streaming_stats = weso.stats();
-+    }
-+
-+    int d_bits = D.num_bits();
-+    std::vector<unsigned char> y_serialized = SerializeForm(y_ref, d_bits);
-+    std::vector<unsigned char> proof_serialized = SerializeForm(proof_form, d_bits);
-+
-+    if (y_serialized.empty() || proof_serialized.empty()) {
-+        return empty_result();
-+    }
-+
-+    const size_t total = y_serialized.size() + proof_serialized.size();
-+    uint8_t* out = new uint8_t[total];
-+    std::copy(y_serialized.begin(), y_serialized.end(), out);
-+    std::copy(proof_serialized.begin(), proof_serialized.end(), out + y_serialized.size());
-+    return ChiavdfByteArray{out, total};
-+}
-+} // namespace
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations) {
-+    return chiavdf_prove_one_weso_fast_with_progress(
-+        challenge_hash,
-+        challenge_size,
-+        x_s,
-+        x_s_size,
-+        discriminant_size_bits,
-+        num_iterations,
-+        /*progress_interval=*/0,
-+        /*progress_cb=*/nullptr,
-+        /*progress_user_data=*/nullptr);
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data) {
-+    try {
-+        std::call_once(init_once, init_chiavdf_fast);
-+
-+        if (challenge_hash == nullptr || challenge_size == 0 || x_s == nullptr || x_s_size == 0) {
-+            return empty_result();
-+        }
-+        if (num_iterations == 0) {
-+            return empty_result();
-+        }
-+
-+        std::vector<uint8_t> challenge_hash_bytes(challenge_hash, challenge_hash + challenge_size);
-+        integer D = CreateDiscriminant(challenge_hash_bytes, static_cast<int>(discriminant_size_bits));
-+        integer L = root(-D, 4);
-+
-+        form x = DeserializeForm(D, x_s, x_s_size);
-+
-+        std::atomic<bool> stopped(false);
-+        ProgressOneWesolowskiCallback weso(
-+            D,
-+            x,
-+            num_iterations,
-+            progress_interval,
-+            progress_cb,
-+            progress_user_data);
-+
-+        // Run the fast repeated-squaring engine to `num_iterations`.
-+        // The callback stores all intermediates needed for the proof.
-+        FastStorage* fast_storage = nullptr;
-+        repeated_square(num_iterations, x, D, L, &weso, fast_storage, stopped);
-+
-+        // Now generate the compact proof from the stored intermediates.
-+        Proof proof = ProveOneWesolowski(num_iterations, D, x, &weso, stopped);
-+        if (proof.y.empty() || proof.proof.empty()) {
-+            return empty_result();
-+        }
-+
-+        const size_t total = proof.y.size() + proof.proof.size();
-+        uint8_t* out = new uint8_t[total];
-+        std::copy(proof.y.begin(), proof.y.end(), out);
-+        std::copy(proof.proof.begin(), proof.proof.end(), out + proof.y.size());
-+        return ChiavdfByteArray{out, total};
-+    } catch (...) {
-+        return empty_result();
-+    }
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations) {
-+    return chiavdf_prove_one_weso_fast_streaming_with_progress(
-+        challenge_hash,
-+        challenge_size,
-+        x_s,
-+        x_s_size,
-+        y_ref_s,
-+        y_ref_s_size,
-+        discriminant_size_bits,
-+        num_iterations,
-+        /*progress_interval=*/0,
-+        /*progress_cb=*/nullptr,
-+        /*progress_user_data=*/nullptr);
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data) {
-+    try {
-+        return chiavdf_prove_one_weso_fast_streaming_impl(
-+            challenge_hash,
-+            challenge_size,
-+            x_s,
-+            x_s_size,
-+            y_ref_s,
-+            y_ref_s_size,
-+            discriminant_size_bits,
-+            num_iterations,
-+            progress_interval,
-+            progress_cb,
-+            progress_user_data,
-+            /*use_getblock_opt=*/false);
-+    } catch (...) {
-+        return empty_result();
-+    }
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations) {
-+    return chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-+        challenge_hash,
-+        challenge_size,
-+        x_s,
-+        x_s_size,
-+        y_ref_s,
-+        y_ref_s_size,
-+        discriminant_size_bits,
-+        num_iterations,
-+        /*progress_interval=*/0,
-+        /*progress_cb=*/nullptr,
-+        /*progress_user_data=*/nullptr);
-+}
-+
-+extern "C" ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data) {
-+    try {
-+        return chiavdf_prove_one_weso_fast_streaming_impl(
-+            challenge_hash,
-+            challenge_size,
-+            x_s,
-+            x_s_size,
-+            y_ref_s,
-+            y_ref_s_size,
-+            discriminant_size_bits,
-+            num_iterations,
-+            progress_interval,
-+            progress_cb,
-+            progress_user_data,
-+            /*use_getblock_opt=*/true);
-+    } catch (...) {
-+        return empty_result();
-+    }
-+}
-+
-+extern "C" void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes) {
-+    bucket_memory_budget_bytes.store(bytes, std::memory_order_relaxed);
-+}
-+
-+extern "C" void chiavdf_set_enable_streaming_stats(bool enable) {
-+    streaming_stats_enabled.store(enable, std::memory_order_relaxed);
-+    last_streaming_stats = LastStreamingStats{};
-+}
-+
-+extern "C" bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned) {
-+    if (out_k == nullptr || out_l == nullptr || out_tuned == nullptr) {
-+        return false;
-+    }
-+    if (!last_streaming_parameters.set) {
-+        return false;
-+    }
-+    *out_k = last_streaming_parameters.k;
-+    *out_l = last_streaming_parameters.l;
-+    *out_tuned = last_streaming_parameters.tuned;
-+    return true;
-+}
-+
-+extern "C" bool chiavdf_get_last_streaming_stats(
-+    uint64_t* out_checkpoint_total_ns,
-+    uint64_t* out_checkpoint_event_total_ns,
-+    uint64_t* out_finalize_total_ns,
-+    uint64_t* out_checkpoint_calls,
-+    uint64_t* out_bucket_updates) {
-+    if (out_checkpoint_total_ns == nullptr || out_checkpoint_event_total_ns == nullptr ||
-+        out_finalize_total_ns == nullptr || out_checkpoint_calls == nullptr ||
-+        out_bucket_updates == nullptr) {
-+        return false;
-+    }
-+    if (!last_streaming_stats.set) {
-+        return false;
-+    }
-+    *out_checkpoint_total_ns = last_streaming_stats.checkpoint_total_ns;
-+    *out_checkpoint_event_total_ns = last_streaming_stats.checkpoint_event_total_ns;
-+    *out_finalize_total_ns = last_streaming_stats.finalize_total_ns;
-+    *out_checkpoint_calls = last_streaming_stats.checkpoint_calls;
-+    *out_bucket_updates = last_streaming_stats.bucket_updates;
-+    return true;
-+}
-+
-+extern "C" void chiavdf_free_byte_array(ChiavdfByteArray array) { delete[] array.data; }
-diff --git a/src/c_bindings/fast_wrapper.h b/src/c_bindings/fast_wrapper.h
-new file mode 100644
-index 0000000..bf33f32
---- /dev/null
-+++ b/src/c_bindings/fast_wrapper.h
-@@ -0,0 +1,145 @@
-+#pragma once
-+
-+#include <stdbool.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+typedef struct {
-+    uint8_t* data;
-+    size_t length;
-+} ChiavdfByteArray;
-+
-+typedef void (*ChiavdfProgressCallback)(uint64_t iters_done, void* user_data);
-+
-+// Configure the per-process memory budget used by the parameter tuner when
-+// selecting `(k,l)` for streaming/bucket-based proving.
-+//
-+// The budget is per worker process (not global across multiple processes).
-+//
-+// If `bytes` is 0, the default chiavdf heuristic is used.
-+void chiavdf_set_bucket_memory_budget_bytes(uint64_t bytes);
-+
-+// Debug helper: returns the `(k,l)` parameters selected for the most recent
-+// streaming proof computed on the current thread.
-+//
-+// Returns true if parameters are available.
-+bool chiavdf_get_last_streaming_parameters(uint32_t* out_k, uint32_t* out_l, bool* out_tuned);
-+
-+// Enable lightweight timing counters for the streaming prover.
-+//
-+// When enabled, the native library records basic timing counters for the most
-+// recent streaming proof computed on the current thread. This is intended for
-+// benchmarking and tuning; production runs should keep this disabled to avoid
-+// extra overhead.
-+void chiavdf_set_enable_streaming_stats(bool enable);
-+
-+// Debug helper: returns timing counters for the most recent streaming proof on
-+// the current thread.
-+//
-+// Returns true if stats are available (i.e. stats enabled and a streaming proof
-+// was computed successfully).
-+bool chiavdf_get_last_streaming_stats(
-+    uint64_t* out_checkpoint_total_ns,
-+    uint64_t* out_checkpoint_event_total_ns,
-+    uint64_t* out_finalize_total_ns,
-+    uint64_t* out_checkpoint_calls,
-+    uint64_t* out_bucket_updates);
-+
-+// Computes a compact (witness_type=0) Wesolowski proof using the fast engine.
-+//
-+// On success, returns `y || proof` where:
-+// - `y` is the serialized output form (typically 100 bytes for 1024-bit discriminants)
-+// - `proof` is the serialized witness form (same size as `y`)
-+//
-+// On failure, returns `{NULL, 0}`.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations);
-+
-+// Same as `chiavdf_prove_one_weso_fast`, but optionally invokes `progress_cb` from
-+// the proving thread every `progress_interval` iterations completed.
-+//
-+// If `progress_cb` is NULL or `progress_interval` is 0, no progress is reported.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data);
-+
-+// Computes a compact (witness_type=0) Wesolowski proof using the "streaming"
-+// bucket-accumulation algorithm (Trick 1), which requires the expected output
-+// `y_ref` up front (as used by bluebox compaction jobs).
-+//
-+// On success, returns `y || proof` (same format as `chiavdf_prove_one_weso_fast`).
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations);
-+
-+// Same as `chiavdf_prove_one_weso_fast_streaming`, but optionally invokes
-+// `progress_cb` from the proving thread every `progress_interval` iterations.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data);
-+
-+// Same as `chiavdf_prove_one_weso_fast_streaming`, but with an optimized
-+// implementation of the `GetBlock()` mapping (avoids per-block modular
-+// exponentiation without allocating a full `GetBlock` table).
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations);
-+
-+// Same as `chiavdf_prove_one_weso_fast_streaming_getblock_opt`, but optionally
-+// invokes `progress_cb` from the proving thread every `progress_interval`
-+// iterations.
-+ChiavdfByteArray chiavdf_prove_one_weso_fast_streaming_getblock_opt_with_progress(
-+    const uint8_t* challenge_hash,
-+    size_t challenge_size,
-+    const uint8_t* x_s,
-+    size_t x_s_size,
-+    const uint8_t* y_ref_s,
-+    size_t y_ref_s_size,
-+    size_t discriminant_size_bits,
-+    uint64_t num_iterations,
-+    uint64_t progress_interval,
-+    ChiavdfProgressCallback progress_cb,
-+    void* progress_user_data);
-+
-+void chiavdf_free_byte_array(ChiavdfByteArray array);
-+
-+#ifdef __cplusplus
-+}
-+#endif
-diff --git a/src/threading.h b/src/threading.h
-index 50d4b49..f6344ad 100644
---- a/src/threading.h
-+++ b/src/threading.h
-@@ -564,8 +564,8 @@ struct alignas(64) thread_counter {
-     }
- };
- 
--thread_counter master_counter[100];
--thread_counter slave_counter[100];
-+thread_counter master_counter[512];
-+thread_counter slave_counter[512];
- 
- struct thread_state {
-     int pairindex;
-diff --git a/src/vdf.h b/src/vdf.h
-index 9ab4aef..4544fe2 100644
---- a/src/vdf.h
-+++ b/src/vdf.h
-@@ -78,6 +78,18 @@ std::mutex new_event_mutex, cout_lock;
- bool debug_mode = false;
- bool fast_algorithm = false;
- bool two_weso = false;
-+bool quiet_mode = false;
-+
-+// vdf_fast uses shared master/slave counters keyed by `square_state.pairindex`.
-+// The upstream chiavdf binaries run one VDF per process and hardcode `pairindex=0`.
-+// In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
-+// run concurrently in the same process; they must not share a pairindex.
-+inline int vdf_fast_pairindex() {
-+    constexpr int kSlots = int(sizeof(master_counter) / sizeof(master_counter[0]));
-+    static std::atomic<int> next_slot{0};
-+    thread_local int slot = next_slot.fetch_add(1, std::memory_order_relaxed) % kSlots;
-+    return slot;
-+}
- 
- //always works
- void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, const integer& L, uint64 base, uint64 iterations, INUDUPLListener *nuduplListener) {
-@@ -137,7 +149,7 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
- 
-         // This works single threaded
-         square_state_type square_state;
--        square_state.pairindex=0;
-+        square_state.pairindex=vdf_fast_pairindex();
- 
-         uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
- 
-@@ -236,10 +248,12 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
-             }
-         #endif
-     }
--    {
--        // this shouldn't be needed but avoids some false positive in TSAN
--        std::lock_guard<std::mutex> lk(cout_lock);
--        std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
-+    if (!quiet_mode) {
-+        {
-+            // this shouldn't be needed but avoids some false positive in TSAN
-+            std::lock_guard<std::mutex> lk(cout_lock);
-+            std::cout << "VDF loop finished. Total iters: " << num_iterations << "\n" << std::flush;
-+        }
-     }
- 
-     #ifdef VDF_TEST
-@@ -275,11 +289,6 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
-     proof_serialized = SerializeForm(proof_form, d_bits);
-     Proof proof(y_serialized, proof_serialized);
-     proof.witness_type = 0;
--    {
--        // this shouldn't be needed but avoids some false positive in TSAN
--        std::lock_guard<std::mutex> lk(cout_lock);
--        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
--    }
-     return proof;
- }
- 
-diff --git a/docs/bluebox_compaction.md b/docs/bluebox_compaction.md
-new file mode 100644
-index 0000000..61cd1fd
---- /dev/null
-+++ b/docs/bluebox_compaction.md
-@@ -0,0 +1,49 @@
-+# Bluebox Compaction Optimizations
-+
-+This document describes the compaction-oriented proving path exposed by
-+`src/c_bindings/fast_wrapper.h` and implemented in
-+`src/c_bindings/fast_wrapper.cpp`.
-+
-+## Scope
-+
-+These APIs are intended for workloads where the expected VDF output (`y_ref`) is
-+already known up front (for example, bluebox compaction jobs). They are additive
-+and do not change the existing `c_wrapper` APIs.
-+
-+## Optimization 1: Streaming one-wesolowski
-+
-+Given `y_ref`, the prover computes:
-+
-+- `B = GetB(D, x, y_ref)` before squaring starts
-+
-+This enables a streaming algorithm that updates proof buckets at each
-+checkpoint during repeated squaring, instead of materializing the full
-+intermediate checkpoint array and scanning it after the loop. In practice this
-+substantially reduces memory usage for compaction workloads.
-+
-+## Optimization 2: Incremental GetBlock mapping
-+
-+For streaming checkpoint updates, bucket index selection repeatedly calls
-+`GetBlock(p, k, T, B)`. The optimized mode keeps a rolling modular state and
-+advances sequential `p` values incrementally, avoiding full modular
-+exponentiation per call and avoiding a large lookup table.
-+
-+## Optimization 3: Memory-budgeted (k, l) tuning
-+
-+The wrapper can tune `(k, l)` under a configured memory budget:
-+
-+- `chiavdf_set_bucket_memory_budget_bytes(...)`
-+
-+If no tuned candidate is found, the code falls back to the standard parameter
-+heuristics.
-+
-+## Operational Notes
-+
-+- The `fast_wrapper` code path sets one-wesolowski mode and uses `quiet_mode` to
-+  avoid unsolicited stdout noise when embedded in multi-worker clients.
-+- Thread-slot assignment for the fast VDF counters is per-thread via
-+  `vdf_fast_pairindex()`, avoiding slot collisions when multiple VDF computations
-+  run in one process.
-+- The production default for `enable_threads` in `parameters.h` is unchanged from
-+  upstream to preserve timelord expectations.
-+
diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index af3bf805..ae834c84 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -169,11 +169,12 @@ bool tune_streaming_parameters(
                 continue;
             }
 
-            unsigned __int128 updates = static_cast<unsigned __int128>(
-                (num_iterations + static_cast<uint64_t>(k) - 1) / static_cast<uint64_t>(k));
             uint64_t kl = static_cast<uint64_t>(k) * static_cast<uint64_t>(l);
             unsigned __int128 checkpoints = static_cast<unsigned __int128>(
                 (num_iterations + kl - 1) / kl);
+            // Each checkpoint can trigger up to `l` bucket updates (one per sub-block).
+            // Model update work as checkpoint-count scaled by `l`.
+            unsigned __int128 updates = checkpoints * static_cast<unsigned __int128>(l);
             unsigned __int128 fold = static_cast<unsigned __int128>(l) << (k + 1);
             unsigned __int128 cost =
                 updates * update_weight + checkpoints * checkpoint_weight + fold * fold_weight;

From 61e9280ead4d4cd19caf8f0984b4ab562730ad6a Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 20:08:54 -0700
Subject: [PATCH 12/13] Adapt streaming callback checkpoint scheduling from
 fb0e2c2.

Replace the per-iteration `iteration % kl == 0` check in the streaming callback with next-checkpoint tracking (`next_checkpoint_t`). The next checkpoint is also recomputed from each batch's base iteration at batch boundaries, so rollback/replay semantics remain correct in the current upstreamed implementation.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/c_bindings/fast_wrapper.cpp | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/c_bindings/fast_wrapper.cpp b/src/c_bindings/fast_wrapper.cpp
index ae834c84..c3351e50 100644
--- a/src/c_bindings/fast_wrapper.cpp
+++ b/src/c_bindings/fast_wrapper.cpp
@@ -295,6 +295,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
           progress_cb(progress_cb),
           progress_user_data(progress_user_data),
           next_progress(progress_interval),
+          next_checkpoint_t((limit <= 1 || kl == 0) ? std::numeric_limits<uint64_t>::max() : kl),
           use_getblock_opt(use_getblock_opt),
           stats_enabled(streaming_stats_enabled.load(std::memory_order_relaxed)) {
         form id = form::identity(D);
@@ -328,7 +329,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
             next_progress += progress_interval;
         }
 
-        if (iteration % kl == 0) {
+        if (iteration == next_checkpoint_t) {
             uint64_t pos = iteration / kl;
             if (pos < limit) {
                 form checkpoint;
@@ -348,6 +349,14 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
                             .count());
                 }
             }
+
+            const uint64_t next_pos = pos + 1;
+            if (next_pos < limit && kl != 0 &&
+                next_pos <= std::numeric_limits<uint64_t>::max() / kl) {
+                next_checkpoint_t = next_pos * kl;
+            } else {
+                next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+            }
         }
 
         if (iteration == wanted_iter) {
@@ -361,6 +370,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         if (batch_size == 0) {
             batch_start_iteration = 1;
             batch_end_iteration = 0;
+            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
             return;
         }
         // `base_iteration` is the number of completed iterations before this batch.
@@ -371,6 +381,24 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
         } else {
             batch_end_iteration = base_iteration + batch_size;
         }
+
+        if (kl == 0 || limit <= 1) {
+            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+            return;
+        }
+
+        const uint64_t first_iteration = saturating_add_u64(base_iteration, 1);
+        const uint64_t numerator = saturating_add_u64(first_iteration, kl - 1);
+        uint64_t first_pos = numerator / kl;
+        if (first_pos == 0) {
+            first_pos = 1;
+        }
+
+        if (first_pos < limit && first_pos <= std::numeric_limits<uint64_t>::max() / kl) {
+            next_checkpoint_t = first_pos * kl;
+        } else {
+            next_checkpoint_t = std::numeric_limits<uint64_t>::max();
+        }
     }
 
     void OnBatchReplay(uint64_t base_iteration, uint64_t batch_size) override {
@@ -524,6 +552,7 @@ class StreamingOneWesolowskiCallback final : public WesolowskiCallback {
     ChiavdfProgressCallback progress_cb;
     void* progress_user_data;
     uint64_t next_progress;
+    uint64_t next_checkpoint_t = std::numeric_limits<uint64_t>::max();
     size_t bucket_span = 0;
 
     std::vector<form> buckets;

From 91a2af96cd83525c20badf67a53727daccf657db Mon Sep 17 00:00:00 2001
From: Gene Hoffman <hoffmang@hoffmang.com>
Date: Tue, 12 May 2026 20:35:42 -0700
Subject: [PATCH 13/13] Address slot reuse and logging consistency in vdf fast
 path.

Lease fast counter slots with per-slot in-use tracking so long-lived processes can recycle released slots safely, and restore the one-weso proof diagnostic behind quiet_mode to keep client logging behavior consistent.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/vdf.h | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/src/vdf.h b/src/vdf.h
index eb1d0d39..92a56b78 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -94,8 +94,6 @@ bool quiet_mode = false;
 // In embedded/multi-worker setups (like WesoForge), multiple VDF computations can
 // run concurrently in the same process; they must not share a pairindex.
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
-// Keep slot allocation state as one program-wide entity for all TUs that include
-// this header, so concurrent callers cannot recycle the same slot sequence.
 inline std::atomic<unsigned int> vdf_fast_next_slot{0};
 #endif
 
@@ -103,8 +101,45 @@ inline int vdf_fast_pairindex() {
 #if (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(CHIA_DISABLE_ASM)
     constexpr unsigned int kSlots = unsigned(sizeof(master_counter) / sizeof(master_counter[0]));
     static_assert(kSlots > 0, "CHIA_VDF_FAST_COUNTER_SLOTS must be > 0");
-    thread_local int slot = int(vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed) % kSlots);
-    return slot;
+    static std::array<std::atomic<bool>, kSlots> vdf_fast_slot_in_use{};
+    struct SlotLease {  // Slot-lease record; its destructor clears the in-use flag only when owns_slot is set.
+        std::array<std::atomic<bool>, kSlots>* slots = nullptr;  // in-use flag table; stays null until a lease is attempted
+        int slot = -1;  // chosen slot index; -1 means no slot has been picked yet
+        bool owns_slot = false;  // true only when this lease won the slot's flag via compare-exchange
+        ~SlotLease() {
+            if (owns_slot && slots != nullptr && slot >= 0) {  // a shared fallback slot (owns_slot == false) is never released here
+                (*slots)[static_cast<size_t>(slot)].store(false, std::memory_order_release);  // mark the slot free for reuse
+            }
+        }
+    };
+
+    thread_local SlotLease lease;
+    if (lease.slot >= 0) {
+        return lease.slot;
+    }
+
+    lease.slots = &vdf_fast_slot_in_use;
+
+    const unsigned int start = vdf_fast_next_slot.fetch_add(1u, std::memory_order_relaxed);
+    for (unsigned int i = 0; i < kSlots; i++) {
+        const unsigned int candidate = (start + i) % kSlots;
+        bool expected = false;
+        if (vdf_fast_slot_in_use[candidate].compare_exchange_strong(
+                expected,
+                true,
+                std::memory_order_acq_rel,
+                std::memory_order_relaxed)) {
+            lease.slot = static_cast<int>(candidate);
+            lease.owns_slot = true;
+            return lease.slot;
+        }
+    }
+
+    // All slots are currently active. Reuse one as a best-effort fallback; the
+    // fast path has corruption detection and can fall back to slow squaring.
+    lease.slot = static_cast<int>(start % kSlots);
+    lease.owns_slot = false;
+    return lease.slot;
 #else
     return 0;
 #endif
@@ -367,6 +402,12 @@ Proof ProveOneWesolowski(uint64_t iters, integer& D, form f, OneWesolowskiCallba
     proof_serialized = SerializeForm(proof_form, d_bits);
     Proof proof(y_serialized, proof_serialized);
     proof.witness_type = 0;
+    if (!quiet_mode) {
+        // Keep proof diagnostics available for vdf_client while quiet_mode
+        // suppresses output in embedded library-mode call paths.
+        std::lock_guard<std::mutex> lk(cout_lock);
+        std::cout << "Got simple weso proof: " << proof.hex() << "\n";
+    }
     return proof;
 }