diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 6772e371..9a258d6e 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -20,21 +20,43 @@ permissions:
 
 jobs:
   fuzz_targets:
-    name: Run fuzzers
+    name: Run fuzzers (${{ matrix.target }})
     runs-on: ubuntu-latest
     env:
       CARGO_PROFILE_RELEASE_LTO: false
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - create_discriminant
+          - prove
+          - verify
+          - verify_n_wesolowski
     steps:
       - uses: actions/checkout@v6
       - uses: dtolnay/rust-toolchain@nightly
 
+      - name: Cache cargo registry + build artifacts
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/bin
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+            rust_bindings/fuzz/corpus
+          key: ${{ runner.os }}-rust-fuzz-${{ matrix.target }}-${{ hashFiles('Cargo.lock') }}  # per-target: corpus and build output differ between matrix jobs
+
       - name: Install cargo-fuzz
-        run: cargo +nightly install cargo-fuzz
+        run: |
+          if ! command -v cargo-fuzz >/dev/null 2>&1; then
+            cargo +nightly install cargo-fuzz --locked
+          fi
 
-      - name: Cargo fuzz
+      - name: Cargo fuzz (${{ matrix.target }})
         run: |
           cd rust_bindings
-          cargo fuzz list | xargs -I "%" sh -c "cargo +nightly fuzz run % -- -max_total_time=600 || exit 255"
+          cargo +nightly fuzz run ${{ matrix.target }} -- -max_total_time=600
 
   lint:
     name: Lint
@@ -110,7 +132,10 @@ jobs:
 
       - name: Install libclang-dev on Linux
         if: matrix.os.name == 'Ubuntu'
-        run: sudo apt-get install libclang-dev -y
+        run: |
+          # Avoid transient 404s from stale apt indices / mirror lag.
+          sudo apt-get update -y -o Acquire::Retries=3
+          sudo apt-get install libclang-dev -y
 
       - name: Set up Rust
         uses: dtolnay/rust-toolchain@stable
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index bb8ae50d..729e19ee 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [macos-13-intel, ubuntu-latest]
+        os: [macos-13-intel, macos-13-arm64, ubuntu-latest]
         config: [optimized=1, TSAN=1, ASAN=1]
 
     steps:
@@ -34,6 +34,8 @@ jobs:
     - name: Build vdf-client on Ubuntu
       if: startsWith(matrix.os, 'ubuntu')
       run: |
+        # Avoid transient 404s from stale apt indices / mirror lag.
+        sudo apt-get update -y -o Acquire::Retries=3
         sudo apt-get install libgmp-dev libboost-python-dev libpython3-dev libboost-system-dev build-essential -y
         cd src
         make ${{ matrix.config }} -f Makefile.vdf-client
@@ -54,7 +56,11 @@ jobs:
         echo "Running 2weso_test"
         ./2weso_test
         echo "Running prover_test"
-        ./prover_test
+        if [[ "${{ matrix.os }}" == ubuntu* ]]; then
+          ./prover_test
+        else
+          CHIAVDF_PROVER_TEST_FAST=1 ./prover_test
+        fi
 
     - name: Test vdf-client
       if: matrix.config != 'optimized=1'
@@ -73,7 +79,11 @@ jobs:
       run: |
         cd src
         echo "Running prover_test"
-        ./prover_test
+        if [[ "${{ matrix.os }}" == ubuntu* ]]; then
+          ./prover_test
+        else
+          CHIAVDF_PROVER_TEST_FAST=1 ./prover_test
+        fi
 
     - name: Benchmark vdf-client
       if: matrix.config == 'optimized=1'
diff --git a/README.md b/README.md
index a374a0cd..537cd261 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,8 @@ MacOS and Linux, chiavdf can be used to compile vdf_client and vdf_bench.
 vdf_client is the core VDF process that completes the Proof of Time submitted
 to it by the Timelord. The repo also includes a benchmarking tool to get a
 sense of the iterations per second of a given CPU called vdf_bench. Try
-`./vdf_bench square_asm 250000` for an ips estimate.
+`./vdf_bench square_asm 250000` for an ips estimate on x86/x64 (phased/asm
+pipeline). On non-x86 architectures, use `./vdf_bench square 250000` (NUDUPL).
 
 To build vdf_client set the environment variable BUILD_VDF_CLIENT to "Y".
 `export BUILD_VDF_CLIENT=Y`.
@@ -60,6 +61,16 @@ If you're running a timelord, the following tests are available, depending of wh
 
 Those tests will simulate the vdf_client and verify for correctness the produced proofs.
 
+Note: `./prover_test` defaults to a long soak/stress run. Set
+`CHIAVDF_PROVER_TEST_FAST=1` to run a short, CI-friendly correctness check.
+
+## Fuzzing
+
+Fuzz targets live under `rust_bindings/fuzz`. The `prove` target includes an
+iteration cap to avoid out-of-memory conditions in CI. If you want deeper
+iteration coverage, raise the cap in `rust_bindings/fuzz/fuzz_targets/prove.rs`
+after validating memory usage and exec/s on your runner.
+
 ## Contributing and workflow
 
 Contributions are welcome and more details are available in chia-blockchain's
diff --git a/pyproject.toml b/pyproject.toml
index 1bfd3a4a..55a04341 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,10 @@ before-build = "python -m pip install --upgrade pip"
 
 [tool.cibuildwheel.macos]
 build-verbosity = 0
-before-all = "brew install gmp boost cmake"
+before-all = """
+brew --prefix --installed gmp >/dev/null 2>&1 || brew install gmp
+brew install boost cmake
+"""
 before-build = "python -m pip install --upgrade pip"
 environment = {MACOSX_DEPLOYMENT_TARGET="13", SYSTEM_VERSION_COMPAT=0, BUILD_VDF_CLIENT="N"}
 
diff --git a/rust_bindings/fuzz/fuzz_targets/prove.rs b/rust_bindings/fuzz/fuzz_targets/prove.rs
index 64a962cd..cea282e4 100644
--- a/rust_bindings/fuzz/fuzz_targets/prove.rs
+++ b/rust_bindings/fuzz/fuzz_targets/prove.rs
@@ -1,9 +1,32 @@
 #![no_main]
 
 use chiavdf::prove;
-use libfuzzer_sys::fuzz_target;
+use libfuzzer_sys::{fuzz_target, Corpus};
 
-fuzz_target!(|data: ([u8; 32], [u8; 100], u16)| {
+// Fuzzing `prove()` with unbounded `iters` can explode memory usage and runtime.
+// The cost of the underlying VDF prover is at least linear in `iters`, and in
+// practice can become superlinear due to internal allocation patterns. We have
+// observed OOM (exit 137) in CI when `iters` is allowed to reach the full `u16`
+// range, so we cap it to keep fuzzing stable and high-throughput.
+//
+// Why 4096:
+// - Large enough to exercise multiple loop iterations and proof paths beyond
+//   "toy" counts, preserving meaningful coverage.
+// - Small enough to keep inputs fast and avoid pathological allocations across
+//   typical CI memory limits.
+// - Selected empirically as a conservative upper bound given prior OOMs; it can
+//   be raised later if measurements show steady memory and acceptable exec/s.
+//
+// If you want deeper iteration coverage, consider a separate stress target or
+// a time/iteration-budgeted harness rather than unbounded fuzz inputs.
+const MAX_ITERS: u64 = 4096;
+
+fuzz_target!(|data: ([u8; 32], [u8; 100], u16)| -> Corpus {
     let (genesis_challenge, element, iters) = data;
-    prove(&genesis_challenge, &element, 1024, iters as u64);
+    let iters = iters as u64;
+    if iters > MAX_ITERS {
+        return Corpus::Reject;
+    }
+    prove(&genesis_challenge, &element, 1024, iters);
+    Corpus::Keep
 });
diff --git a/setup.py b/setup.py
index b9e1545c..595f8bcb 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,7 @@
 import shutil
 import subprocess
 import sys
+from pathlib import Path
 
 from setuptools import Command, Extension, setup
 from setuptools.command.build import build
@@ -134,6 +135,11 @@ def build_extension(self, ext):
 build.sub_commands.append(("build_hook", lambda x: True))  # type: ignore
 install.sub_commands.append(("install_hook", lambda x: True))
 
+# Wheel metadata generation on Windows can run with a non-UTF8 default encoding.
+# Read `README.md` explicitly as UTF-8 so `long_description` is robust across runners.
+_readme_path = Path(__file__).resolve().parent / "README.md"
+_long_description = _readme_path.read_text(encoding="utf-8")
+
 setup(
     name="chiavdf",
     author="Florin Chirica",
@@ -141,7 +147,7 @@ def build_extension(self, ext):
     description="Chia vdf verification (wraps C++)",
     license="Apache-2.0",
     python_requires=">=3.9",
-    long_description=open("README.md").read(),
+    long_description=_long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/Chia-Network/chiavdf",
     ext_modules=[CMakeExtension("chiavdf", "src")],
diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client
index ed41963f..9b0aa5a5 100644
--- a/src/Makefile.vdf-client
+++ b/src/Makefile.vdf-client
@@ -1,16 +1,32 @@
 UNAME := $(shell uname)
+ARCH := $(shell uname -m)
 
 ifneq (,$(findstring clang, $(shell $(CXX) --version)))
 NOPIE = -fno-PIE
 else
 NOPIE = -no-pie
 endif
+# macOS arm64 ignores -no_pie and warns; omit to avoid deprecation warnings
+ifeq ($(UNAME),Darwin)
+ifneq ($(filter $(ARCH),arm64),)
+NOPIE =
+endif
+endif
 
 LDFLAGS += -flto $(NOPIE) -g
 LDLIBS += -lgmpxx -lgmp -pthread
 CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden
 ifeq ($(UNAME),Darwin)
 CXXFLAGS += -D CHIAOSX=1
+# Homebrew (common on macOS) installs boost/gmp to /opt/homebrew or /usr/local
+ifneq ($(wildcard /opt/homebrew/include/boost/asio.hpp),)
+CXXFLAGS += -I/opt/homebrew/include
+LDFLAGS += -L/opt/homebrew/lib
+endif
+ifneq ($(wildcard /usr/local/include/boost/asio.hpp),)
+CXXFLAGS += -I/usr/local/include
+LDFLAGS += -L/usr/local/lib
+endif
 endif
 
 OPT_CFLAGS = -O3 -g
@@ -27,13 +43,20 @@ endif
 
 .PHONY: all clean
 
+# Only x86_64 builds use the x86 asm objects
+ifeq ($(ARCH),x86_64)
+ASM_OBJS = asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
+else
+ASM_OBJS =
+endif
+
 BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench
 all: $(BINS)
 
 clean:
 	rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client
 
-$(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o
+$(BINS) avx512_test: %: %.o lzcnt.o $(ASM_OBJS)
 	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
 
 $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS)
diff --git a/src/avx512_integer.h b/src/avx512_integer.h
index 489408f8..9d683e7a 100644
--- a/src/avx512_integer.h
+++ b/src/avx512_integer.h
@@ -123,6 +123,7 @@ void mpz_impl_set_mul(
     const mpz<expected_size_a, padded_size_a>& a,
     const mpz<expected_size_b, padded_size_b>& b
 ) {
+#if defined(ARCH_X86) || defined(ARCH_X64)
     if (enable_avx512_ifma) {
         typename avx512_integer_for_size<expected_size_a>::i a_avx512;
         typename avx512_integer_for_size<expected_size_b>::i b_avx512;
@@ -132,7 +133,9 @@ void mpz_impl_set_mul(
         b_avx512=b;
         out_avx512.set_mul(a_avx512, b_avx512);
         out_avx512.assign(out);
-    } else {
+    } else
+#endif
+    {
         mpz_mul(out._(), a._(), b._());
     }
 }
diff --git a/src/callback.h b/src/callback.h
index 92207d1b..a10cb054 100644
--- a/src/callback.h
+++ b/src/callback.h
@@ -2,6 +2,7 @@
 #define CALLBACK_H
 
 #include "util.h"
+#include "nudupl_listener.h"
 
 // Applies to n-weso.
 const int kWindowSize = 20;
@@ -32,6 +33,7 @@ class WesolowskiCallback :public INUDUPLListener {
         switch(type) {
             case NL_SQUARESTATE:
             {
+#if defined(ARCH_X86) || defined(ARCH_X64)
                 //cout << "NL_SQUARESTATE" << endl;
                 uint64 res;
 
@@ -39,6 +41,11 @@ class WesolowskiCallback :public INUDUPLListener {
 
                 if(!square_state->assign(mulf->a, mulf->b, mulf->c, res))
                     cout << "square_state->assign failed" << endl;
+#else
+                // Phased pipeline is x86/x64-only.
+                (void)data;
+                cout << "NL_SQUARESTATE unsupported on this architecture" << endl;
+#endif
                 break;
             }
             case NL_FORM:
diff --git a/src/chiavdf_profile.h b/src/chiavdf_profile.h
new file mode 100644
index 00000000..f1c4c1d7
--- /dev/null
+++ b/src/chiavdf_profile.h
@@ -0,0 +1,40 @@
+#ifndef CHIAVDF_PROFILE_H
+#define CHIAVDF_PROFILE_H
+
+#include <cstdint>
+
+// This header centralizes optional profiling hooks used by `vdf.h` (driver) and
+// hot-loop primitives like NUDUPL (`nucomp.h`). Everything is no-op unless:
+// - `VDF_TEST` is enabled (VDF_MODE=1), and
+// - the caller sets `chiavdf_nudupl_profile_sink` (and optionally enables timing).
+
+struct chiavdf_nudupl_profile_stats {
+    // Outer-loop counts (from `repeated_square_nudupl`).
+    uint64_t iters = 0;
+    uint64_t reduce_calls = 0;
+    uint64_t reduce_skipped = 0;
+    uint64_t max_a_limbs = 0;
+
+    // Outer-loop timing (from `repeated_square_nudupl`).
+    uint64_t nudupl_form_time_ns = 0;
+    uint64_t reduce_time_ns = 0;
+
+    // Inner-loop breakdown (from `qfb_nudupl`).
+    uint64_t qfb_nudupl_calls = 0;
+    uint64_t b_negative = 0;
+    uint64_t branch_a_lt_L = 0;
+    uint64_t branch_a_ge_L = 0;
+
+    uint64_t gcdext_time_ns = 0;
+    uint64_t gcdext_s_eq_1 = 0;
+    uint64_t gcdext_s_ne_1 = 0;
+    uint64_t xgcd_partial_time_ns = 0;
+    uint64_t else_branch_time_ns = 0; // time spent in the a>=L branch overall
+};
+
+#if defined(VDF_TEST)
+inline thread_local chiavdf_nudupl_profile_stats* chiavdf_nudupl_profile_sink = nullptr;
+inline thread_local bool chiavdf_nudupl_profile_timing_enabled = false;
+#endif
+
+#endif // CHIAVDF_PROFILE_H
diff --git a/src/nucomp.h b/src/nucomp.h
index aef14a47..708beb43 100644
--- a/src/nucomp.h
+++ b/src/nucomp.h
@@ -38,6 +38,15 @@ limitations under the License.
 
 #include "xgcd_partial.c"
 
+#include "chiavdf_profile.h"
+
+#if defined(VDF_TEST)
+#include <chrono>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+
 #define LOG2(X) (63 - __builtin_clzll((X)))
 //using namespace std;
 
@@ -214,13 +223,44 @@ void nucomp_form(form &a, form const& b, form const& c, integer const& D, intege
 
 void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L)
 {
-    mpz_t a1, c1, cb, k, s, t, u2, v2;
-
-    mpz_init(a1); mpz_init(c1);
-    mpz_init(cb);
-    mpz_init(k);
-    mpz_init(s);
-    mpz_init(t); mpz_init(u2); mpz_init(v2);
+#if defined(VDF_TEST)
+    chiavdf_nudupl_profile_stats* prof = chiavdf_nudupl_profile_sink;
+    const bool timing = (prof != nullptr) && chiavdf_nudupl_profile_timing_enabled;
+    if (prof != nullptr) {
+        ++prof->qfb_nudupl_calls;
+    }
+#else
+    chiavdf_nudupl_profile_stats* prof = nullptr;
+    const bool timing = false;
+#endif
+    // Performance note:
+    // This function is on the hot path for ARM `square_vdf` (NUDUPL). Avoid per-iteration
+    // `mpz_init/mpz_clear` churn by reusing a thread-local scratch context.
+    struct qfb_nudupl_ctx {
+        mpz_t a1, c1, cb, k, s, t, u2, v2;
+        mpz_t b_abs;
+        mpz_t m2, r1, r2, co1, co2, temp;  // only used in the "a1 >= L" branch
+
+        qfb_nudupl_ctx() {
+            mpz_inits(a1, c1, cb, k, s, t, u2, v2, b_abs, m2, r1, r2, co1, co2, temp, nullptr);
+        }
+        ~qfb_nudupl_ctx() {
+            mpz_clears(a1, c1, cb, k, s, t, u2, v2, b_abs, m2, r1, r2, co1, co2, temp, nullptr);
+        }
+        qfb_nudupl_ctx(const qfb_nudupl_ctx&) = delete;
+        qfb_nudupl_ctx& operator=(const qfb_nudupl_ctx&) = delete;
+    };
+    static thread_local qfb_nudupl_ctx ctx;
+
+    mpz_t& a1 = ctx.a1;
+    mpz_t& c1 = ctx.c1;
+    mpz_t& cb = ctx.cb;
+    mpz_t& k  = ctx.k;
+    mpz_t& s  = ctx.s;
+    mpz_t& t  = ctx.t;
+    mpz_t& u2 = ctx.u2;
+    mpz_t& v2 = ctx.v2;
+    mpz_t& b_abs = ctx.b_abs;
 
     /* nucomp calculation */
 
@@ -229,29 +269,66 @@ void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L)
     /* c1 = c */
     mpz_set(c1, f->c);
 
-    /* b < 0 */
-    if (mpz_sgn(f->b) < 0) {
-        mpz_neg(f->b, f->b);
-        /* s = gcd(abs(b), a); v2 = inv(b) (mod a) */
-        mpz_gcdext(s, v2, NULL, f->b, a1);
-        mpz_neg(f->b, f->b);
-        mpz_neg(v2, v2);
+    const int b_sgn = mpz_sgn(f->b);
+
+    if (b_sgn < 0) {
+#if defined(VDF_TEST)
+        if (prof != nullptr) ++prof->b_negative;
+        std::chrono::steady_clock::time_point t_g0;
+        if (timing) t_g0 = std::chrono::steady_clock::now();
+#endif
+        // Use |b| for gcdext/invert and apply sign afterwards; avoids mutating f->b in-place.
+        mpz_neg(b_abs, f->b);
+        /* s = gcd(|b|, a); v2 = coefficient for |b| (mod a) */
+        mpz_gcdext(s, v2, NULL, b_abs, a1);
+        mpz_neg(v2, v2); // convert coefficient for |b| into coefficient for b (negative)
+#if defined(VDF_TEST)
+        if (timing) {
+            const auto t_g1 = std::chrono::steady_clock::now();
+            prof->gcdext_time_ns += uint64_t(std::chrono::duration_cast<std::chrono::nanoseconds>(t_g1 - t_g0).count());
+        }
+#endif
     } else {
-        mpz_gcdext(s, v2, NULL, f->b, a1);
+#if defined(VDF_TEST)
+        std::chrono::steady_clock::time_point t_g0;
+        if (timing) t_g0 = std::chrono::steady_clock::now();
+#endif
+        // b >= 0: hand f->b straight to gcdext (its inputs are read-only), skipping a scratch copy.
+        mpz_gcdext(s, v2, NULL, f->b, a1);
+#if defined(VDF_TEST)
+        if (timing) {
+            const auto t_g1 = std::chrono::steady_clock::now();
+            prof->gcdext_time_ns += uint64_t(std::chrono::duration_cast<std::chrono::nanoseconds>(t_g1 - t_g0).count());
+        }
+#endif
     }
 
     mpz_mul(k, v2, c1);
     mpz_neg(k, k);
 
-    if (mpz_cmp_ui(s, 1)) {
+    const bool s_is_1 = (mpz_cmp_ui(s, 1) == 0);
+#if defined(VDF_TEST)
+    if (prof != nullptr) {
+        if (s_is_1) ++prof->gcdext_s_eq_1;
+        else ++prof->gcdext_s_ne_1;
+    }
+#endif
+
+    if (!s_is_1) {
         mpz_fdiv_q(a1, a1, s);
         mpz_mul(c1, c1, s);
     }
 
     /* k = -(c*inv(b)) (mod a) */
-    mpz_fdiv_r(k, k, a1);
+    // `mpz_fdiv_r` implements a floor-remainder; for positive modulus `a1`, we can
+    // compute the (typically faster) trunc-remainder and fix up negative results.
+    mpz_tdiv_r(k, k, a1);
+    if (mpz_sgn(k) < 0) mpz_add(k, k, a1);
 
     if (mpz_cmp(a1, L) < 0) {
+#if defined(VDF_TEST)
+        if (prof != nullptr) ++prof->branch_a_lt_L;
+#endif
         mpz_mul(t, a1, k);
 
         mpz_mul(r->a, a1, a1);
@@ -265,17 +342,34 @@ void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L)
 
         mpz_fdiv_q(r->c, r->c, a1);
     } else {
-        mpz_t m2, r1, r2, co1, co2, temp;
-
-        mpz_init(m2); mpz_init(r1); mpz_init(r2);
-        mpz_init(co1); mpz_init(co2); mpz_init(temp);
+#if defined(VDF_TEST)
+        if (prof != nullptr) ++prof->branch_a_ge_L;
+        std::chrono::steady_clock::time_point t_else0;
+        if (timing) t_else0 = std::chrono::steady_clock::now();
+#endif
+        mpz_t& m2 = ctx.m2;
+        mpz_t& r1 = ctx.r1;
+        mpz_t& r2 = ctx.r2;
+        mpz_t& co1 = ctx.co1;
+        mpz_t& co2 = ctx.co2;
+        mpz_t& temp = ctx.temp;
 
         mpz_set(r2, a1);
         /* r1 = k */
         mpz_swap(r1, k);
 
         /* Satisfies co2*r1 - co1*r2 == +/- r2_orig */
+#if defined(VDF_TEST)
+        std::chrono::steady_clock::time_point t_x0;
+        if (timing) t_x0 = std::chrono::steady_clock::now();
+#endif
         mpz_xgcd_partial(co2, co1, r2, r1, L);
+#if defined(VDF_TEST)
+        if (timing) {
+            const auto t_x1 = std::chrono::steady_clock::now();
+            prof->xgcd_partial_time_ns += uint64_t(std::chrono::duration_cast<std::chrono::nanoseconds>(t_x1 - t_x0).count());
+        }
+#endif
 
         /* m2 = b * r1 */
         mpz_mul(m2, f->b, r1);
@@ -310,18 +404,15 @@ void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L)
             mpz_neg(r->a, r->a);
             mpz_neg(r->c, r->c);
         }
-
-        mpz_clear(m2); mpz_clear(r1); mpz_clear(r2);
-        mpz_clear(co1); mpz_clear(co2); mpz_clear(temp);
+#if defined(VDF_TEST)
+        if (timing) {
+            const auto t_else1 = std::chrono::steady_clock::now();
+            prof->else_branch_time_ns += uint64_t(std::chrono::duration_cast<std::chrono::nanoseconds>(t_else1 - t_else0).count());
+        }
+#endif
     }
 
     mpz_set(r->b, cb);
-
-    mpz_clear(cb);
-    mpz_clear(k);
-    mpz_clear(s);
-    mpz_clear(t); mpz_clear(u2); mpz_clear(v2);
-    mpz_clear(a1); mpz_clear(c1);
 }
 
 // a = b * b
diff --git a/src/nudupl_listener.h b/src/nudupl_listener.h
new file mode 100644
index 00000000..8aec2e19
--- /dev/null
+++ b/src/nudupl_listener.h
@@ -0,0 +1,19 @@
+#ifndef CHIAVDF_NUDUPL_LISTENER_H
+#define CHIAVDF_NUDUPL_LISTENER_H
+
+#include "include.h"
+
+// Notification types for `INUDUPLListener::OnIteration`.
+//
+// NL_SQUARESTATE: payload is `square_state_type*` (x86/x64 phased pipeline only).
+// NL_FORM: payload is `vdf_original::form*` (used by both the original slow loop and the ARM NUDUPL loop via a view).
+#define NL_SQUARESTATE 1
+#define NL_FORM 2
+
+class INUDUPLListener {
+public:
+    virtual ~INUDUPLListener() = default;
+    virtual void OnIteration(int type, void* data, uint64 iteration) = 0;
+};
+
+#endif // CHIAVDF_NUDUPL_LISTENER_H
diff --git a/src/parameters.h b/src/parameters.h
index c9d89ce6..85481e4e 100644
--- a/src/parameters.h
+++ b/src/parameters.h
@@ -38,7 +38,7 @@ bool enable_avx512_ifma=false;
     #define ARCH_X86
 #elif defined(__x86_64__) || defined(_M_X64)
     #define ARCH_X64
-#elif (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 5) || (defined(_M_ARM) && _M_ARM >= 5) || defined(__ARM_FEATURE_CLZ) /* ARM (Architecture Version 5) */
+#elif defined(__aarch64__) || (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 5) || (defined(_M_ARM) && _M_ARM >= 5) || defined(__ARM_FEATURE_CLZ) /* ARM (aarch64 or Architecture Version 5+) */
     #define ARCH_ARM
 #endif
 
diff --git a/src/prover_test.cpp b/src/prover_test.cpp
index 30d5e198..7b453cd3 100644
--- a/src/prover_test.cpp
+++ b/src/prover_test.cpp
@@ -2,11 +2,32 @@
 #include "verifier.h"
 #include "create_discriminant.h"
 #include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <thread>
+#include <vector>
+#include <chrono>
 
 int segments = 7;
 int thread_count = 3;
 std::atomic<bool> stop_signal{false};
 
+// Reports whether environment variable `name` is set to one of the accepted "truthy" spellings.
+static bool env_truthy(const char* name)
+{
+    static const char* const kTruthy[] = {"1", "true", "TRUE", "yes", "YES", "on", "ON"};
+    const char* value = std::getenv(name);
+    if (value == nullptr || value[0] == '\0') {
+        return false;  // unset or empty counts as "off"
+    }
+    for (const char* candidate : kTruthy) {
+        if (std::strcmp(value, candidate) == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
 Proof CreateProof(integer D, ProverManager& pm, uint64_t iteration) {
     Proof proof = pm.Prove(iteration);
     if (!stop_signal) {
@@ -60,10 +81,30 @@ int main() {
     ProverManager pm(D, (FastAlgorithmCallback*)weso, fast_storage, segments, thread_count);
     pm.start();
     std::vector<std::thread> threads;
-    for (int i = 0; i <= 30; i++) {
-        threads.emplace_back(CreateProof, D, std::ref(pm), (1 << 21) * i + 60000);
+
+    // This binary is used by CI as a correctness test. Historically it also served as a 5-minute
+    // soak/stress test; that dominates the wall-clock runtime of the "all tests" run.
+    //
+    // Default behavior: run the historical long/soak test.
+    // Fast/CI-friendly mode: set `CHIAVDF_PROVER_TEST_FAST=1` to run just a few proofs and exit.
+    const bool fast_mode = env_truthy("CHIAVDF_PROVER_TEST_FAST");
+    const bool is_ci = (std::getenv("CI") != nullptr) || (std::getenv("GITHUB_ACTIONS") != nullptr);
+
+    if (!fast_mode) {
+        for (int i = 0; i <= 30; i++) {
+            threads.emplace_back(CreateProof, D, std::ref(pm), (1ULL << 21) * uint64_t(i) + 60000);
+        }
+        std::this_thread::sleep_for(std::chrono::seconds(300));
+    } else {
+        // Keep iterations small enough to complete quickly on CI runners.
+        const int max_i = is_ci ? 3 : 6;
+        for (int i = 0; i < max_i; i++) {
+            threads.emplace_back(CreateProof, D, std::ref(pm), (1ULL << 18) * uint64_t(i) + 60000);
+        }
+        for (auto& t : threads) t.join();
+        threads.clear();
     }
-    std::this_thread::sleep_for (std::chrono::seconds(300));
+
     stop_signal = true;
     std::cout << "Stopping everything.\n";
     pm.stop();
diff --git a/src/vdf.h b/src/vdf.h
index 9ab4aef4..e83c323a 100644
--- a/src/vdf.h
+++ b/src/vdf.h
@@ -3,15 +3,16 @@
 
 #include "include.h"
 
+#if defined(__i386__) || defined(__i386) || defined(__x86_64__)  // compiler macros: ARCH_X86/ARCH_X64 come from parameters.h, included only below
 #include <x86intrin.h>
+#endif
 
 #include "parameters.h"
 
 #include "bit_manipulation.h"
 #include "double_utility.h"
 #include "integer.h"
-
-#include "asm_main.h"
+#include "alloc.hpp"
 
 #include "vdf_original.h"
 
@@ -21,21 +22,30 @@
 #include "gpu_integer.h"
 #include "gpu_integer_divide.h"
 
+#include "nucomp.h"
+
+#include "nudupl_listener.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X64)
+#include "asm_main.h"
+
 #include "gcd_base_continued_fractions.h"
 //#include "gcd_base_divide_table.h"
 #include "gcd_128.h"
 #include "gcd_unsigned.h"
 
-#include "gpu_integer_gcd.h"
-
 #include "asm_types.h"
 
 #include "threading.h"
 #include "avx512_integer.h"
-#include "nucomp.h"
 #include "vdf_fast.h"
+#endif
+
+#include "gpu_integer_gcd.h"
 
+#if defined(ARCH_X86) || defined(ARCH_X64)
 #include "vdf_test.h"
+#endif
 #include <map>
 #include <algorithm>
 
@@ -99,6 +109,51 @@ void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, con
     mpz_set(f.c.impl, f_res->c);
 }
 
+// Slow squaring helper using the C++ NUDUPL implementation (`qfb_nudupl`) plus Pulmark reduction.
+//
+// This is substantially faster than `vdf_original::square()` on some platforms (notably ARM).
+// We intentionally keep the *corruption* correction path on the independent `vdf_original`
+// implementation.
+static inline void repeated_square_nudupl(
+    form& f,
+    integer& D,
+    integer& L,
+    uint64 base,
+    uint64 iterations,
+    WesolowskiCallback* weso,
+    INUDUPLListener* nuduplListener
+) {
+    vdf_original::form f_view;
+    // Defensive fallback: if `weso` is null, use a Pulmark reducer.
+    // Construct it once per call (it does heap work) rather than per reduction.
+    std::optional<PulmarkReducer> fallback_reducer;
+    if (weso == nullptr) {
+        fallback_reducer.emplace();
+    }
+    for (uint64_t i = 0; i < iterations; i++) {
+        nudupl_form(f, f, D, L);
+
+        // Reduce only when `a` grows beyond a small limb threshold. Reducing every iteration
+        // can be slower than letting NUDUPL run a bit "wide".
+        if (__GMP_ABS(f.a.impl->_mp_size) > 8) {
+            if (weso) {
+                weso->reduce(f);
+            } else {
+                fallback_reducer->reduce(f);
+            }
+        }
+
+        if (nuduplListener != nullptr) {
+            // Present the C++ `form` as a `vdf_original::form` view so existing callbacks can
+            // consume it without any new type tags.
+            f_view.a[0] = f.a.impl[0];
+            f_view.b[0] = f.b.impl[0];
+            f_view.c[0] = f.c.impl[0];
+            nuduplListener->OnIteration(NL_FORM, &f_view, base + i);
+        }
+    }
+}
+
 // thread safe; but it is only called from the main thread
 void repeated_square(uint64_t iterations, form f, const integer& D, const integer& L,
     WesolowskiCallback* weso, FastStorage* fast_storage, std::atomic<bool>& stopped)
@@ -123,8 +178,10 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
                 f_copy=f;
                 c_checkpoint_interval=1;
 
+                #if defined(ARCH_X86) || defined(ARCH_X64)
                 f_copy_3=f;
                 f_copy_3_valid=square_fast_impl(f_copy_3, D, L, num_iterations);
+                #endif
             }
         #endif
 
@@ -135,11 +192,19 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege
             repeated_square_original(*weso->vdfo, f, D, L, 100); //randomize the a and b values
         #endif
 
-        // This works single threaded
+        uint64 actual_iterations = 0;
+#if defined(ARCH_X86) || defined(ARCH_X64)
+        // x86/x64: use the phased pipeline.
         square_state_type square_state;
-        square_state.pairindex=0;
-
-        uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
+        square_state.pairindex = 0;
+        actual_iterations = repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso);
+#else
+        // Non-x86: use the C++ NUDUPL path (faster and lower maintenance than the phased pipeline).
+        integer& D_nc = const_cast<integer&>(D);
+        integer& L_nc = const_cast<integer&>(L);
+        repeated_square_nudupl(f, D_nc, L_nc, num_iterations, batch_size, weso, weso);
+        actual_iterations = batch_size;
+#endif
 
         #ifdef VDF_TEST
             ++num_calls_fast;
diff --git a/src/vdf_bench.cpp b/src/vdf_bench.cpp
index fdd121e0..aab8feeb 100644
--- a/src/vdf_bench.cpp
+++ b/src/vdf_bench.cpp
@@ -2,16 +2,19 @@
 #include "bit_manipulation.h"
 #include "double_utility.h"
 #include "parameters.h"
-#include "asm_main.h"
 #include "integer.h"
+#include "alloc.hpp"
 #include "vdf_new.h"
 #include "nucomp.h"
 #include "picosha2.h"
 #include "proof_common.h"
 
+#if defined(ARCH_X86) || defined(ARCH_X64)
+#include "asm_main.h"
 #include "threading.h"
 #include "avx512_integer.h"
 #include "vdf_fast.h"
+#endif
 #include "create_discriminant.h"
 
 #include <cstdlib>
@@ -46,6 +49,7 @@ int main(int argc, char **argv)
     auto t1 = std::chrono::high_resolution_clock::now();
     if (!strcmp(argv[1], "square_asm")) {
         is_asm = true;
+#if defined(ARCH_X86) || defined(ARCH_X64)
         for (i = 0; i < iters; ) {
             square_state_type sq_state;
             sq_state.pairindex = 0;
@@ -64,6 +68,17 @@ int main(int argc, char **argv)
                 i += done;
             }
         }
+#else
+        // On non-x86 architectures we don't build the phased/asm pipeline.
+        // Keep script compatibility by treating `square_asm` as a NUDUPL benchmark.
+        for (i = 0; i < iters; i++) {
+            nudupl_form(y, y, D, L);
+            if (__GMP_ABS(y.a.impl->_mp_size) > 8) {
+                reducer.reduce(y);
+            }
+        }
+        is_asm = false;
+#endif
     } else if (!strcmp(argv[1], "square")) {
         for (i = 0; i < iters; i++) {
             nudupl_form(y, y, D, L);
diff --git a/src/vdf_fast.h b/src/vdf_fast.h
index 34d47586..571a0d04 100644
--- a/src/vdf_fast.h
+++ b/src/vdf_fast.h
@@ -994,13 +994,7 @@ struct square_state_type {
     }*/
 };
 
-#define NL_SQUARESTATE 1
-#define NL_FORM 2
-
-class INUDUPLListener{
-public:
-    virtual void OnIteration(int type, void *data, uint64 iteration)=0;
-};
+#include "nudupl_listener.h"
 
 //this should never have an infinite loop
 //the gcd loops all have maximum counters after which they'll error out, and the thread_state loops also have a maximum spin counter
diff --git a/src/xgcd_partial.c b/src/xgcd_partial.c
index 536e6c19..7e7adf49 100644
--- a/src/xgcd_partial.c
+++ b/src/xgcd_partial.c
@@ -24,32 +24,133 @@
 #define _XGCD_PARTIAL
 
 #include <gmp.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+
+// MSVC doesn't provide `__builtin_clz*`; use bit-scan intrinsics instead.
+static inline int chiavdf_clz_u32(unsigned long x)
+{
+   unsigned long msb = 0;     // receives the index of the highest set bit of x
+   _BitScanReverse(&msb, x);  // callers pass x != 0, so msb is always written
+   return 31 - (int)msb;      // leading zeros of a 32-bit value
+}
+
+static inline int chiavdf_clz_u64(unsigned __int64 x)
+{
+   // Leading-zero count of a 64-bit value; callers pass x != 0.
+   unsigned long msb = 0;
+#if defined(_M_X64) || defined(_M_ARM64)
+   _BitScanReverse64(&msb, x);
+   return 63 - (int)msb;
+#else
+   // 32-bit targets: build the answer from 32-bit scans of the two halves.
+   const unsigned long upper = (unsigned long)(x >> 32);
+   if (upper == 0) {
+      _BitScanReverse(&msb, (unsigned long)(x & 0xffffffffu));
+      return 63 - (int)msb;
+   }
+   _BitScanReverse(&msb, upper);
+   return 31 - (int)msb;
+#endif
+}
+#endif
+
+// Fast helpers (avoid mpz temporaries in tight loops).
+static inline mp_limb_signed_t chiavdf_mpz_bitlen_nonneg(const mpz_t x)
+{
+   // Bit length of x for x >= 0, mirroring mpz_sizeinbase(x, 2):
+   //   x == 0 -> 1
+   //   x  > 0 -> index of the highest set bit, plus one
+   const size_t limb_count = mpz_size(x); // limbs in |x|
+   if (limb_count == 0) return 1;
+   const mp_limb_t msl = mpz_getlimbn(x, (mp_size_t)(limb_count - 1));
+   // The top limb should be non-zero whenever limb_count != 0; guard anyway.
+   if (msl == 0) return 1;
+#if GMP_LIMB_BITS == 64
+#if defined(_MSC_VER)
+   const int zeros = chiavdf_clz_u64((unsigned __int64)msl);
+#else
+   const int zeros = __builtin_clzll((unsigned long long)msl);
+#endif
+#elif GMP_LIMB_BITS == 32
+#if defined(_MSC_VER)
+   const int zeros = chiavdf_clz_u32((unsigned long)msl);
+#else
+   const int zeros = __builtin_clz((unsigned int)msl);
+#endif
+#else
+   // Unusual limb width: count the leading zero bits one at a time.
+   int zeros = 0;
+   while (zeros < GMP_LIMB_BITS &&
+          ((msl >> (GMP_LIMB_BITS - 1 - zeros)) & 1) == 0) {
+      ++zeros;
+   }
+#endif
+   const size_t lower_limb_bits = (limb_count - 1) * (size_t)GMP_LIMB_BITS;
+   return (mp_limb_signed_t)lower_limb_bits + (mp_limb_signed_t)(GMP_LIMB_BITS - zeros);
+}
+
+static inline mp_limb_signed_t chiavdf_mpz_extract_uword_from_shift_nonneg(const mpz_t x, mp_limb_signed_t shift_bits)
+{
+   // Low limb of (x >> shift_bits) for x >= 0, read straight from x's limbs so
+   // no mpz temporary is needed (stands in for mpz_tdiv_q_2exp + mpz_get_ui).
+   // A non-positive shift_bits is treated as "no shift": limb 0 is returned.
+   if (shift_bits <= 0) return (mp_limb_signed_t)mpz_getlimbn(x, 0);
+
+   const mp_limb_signed_t width = (mp_limb_signed_t)GMP_LIMB_BITS;
+   const mp_limb_signed_t word = shift_bits / width;
+   const unsigned rem = (unsigned)(shift_bits % width);
+   const mp_limb_t low_part = mpz_getlimbn(x, (mp_size_t)word);
+   // Limb-aligned shift: one limb is the whole answer (this also keeps the
+   // left shift below from being a shift by the full limb width).
+   if (rem == 0) return (mp_limb_signed_t)low_part;
+
+   // Otherwise stitch the result together from two adjacent limbs.
+   const mp_limb_t high_part = mpz_getlimbn(x, (mp_size_t)(word + 1));
+   return (mp_limb_signed_t)((low_part >> rem) | (high_part << ((unsigned)width - rem)));
+}
 
 void mpz_xgcd_partial(mpz_t co2, mpz_t co1,
                                     mpz_t r2, mpz_t r1, const mpz_t L)
 {
-   mpz_t q, r;
+   // Hot-path note:
+   // This function can run in the inner loop of NUDUPL; avoid per-call
+   // `mpz_init/mpz_clear` by using thread-local temporaries.
+   //
+   // Important for ASAN/LSan: these temporaries must be freed at thread-exit, otherwise
+   // LeakSanitizer will report per-thread GMP allocations as leaked.
+   struct chiavdf_xgcd_partial_tls {
+      mpz_t q, r; // scratch temporaries held by this object
+      // Copying is disabled: copy construction and copy assignment are deleted.
+      chiavdf_xgcd_partial_tls(const chiavdf_xgcd_partial_tls&) = delete;
+      chiavdf_xgcd_partial_tls& operator=(const chiavdf_xgcd_partial_tls&) = delete;
+      // Both temporaries are initialized on construction ...
+      chiavdf_xgcd_partial_tls() { mpz_init(q); mpz_init(r); }
+      // ... and cleared again on destruction.
+      ~chiavdf_xgcd_partial_tls() { mpz_clear(q); mpz_clear(r); }
+   };
+   static thread_local chiavdf_xgcd_partial_tls tls;
+   mpz_ptr q = tls.q;
+   mpz_ptr r = tls.r;
    mp_limb_signed_t aa2, aa1, bb2, bb1, rr1, rr2, qq, bb, t1, t2, t3, i;
    mp_limb_signed_t bits, bits1, bits2;
 
-   mpz_init(q); mpz_init(r);
-
    mpz_set_ui(co2, 0);
    mpz_set_si(co1, -1);
 
    while (mpz_cmp_ui(r1, 0) && mpz_cmp(r1, L) > 0)
    {
-      bits2 = mpz_sizeinbase(r2, 2);
-      bits1 = mpz_sizeinbase(r1, 2);
+      // r2/r1 are expected to be nonnegative here (algorithm maintains sign after each step).
+      bits2 = chiavdf_mpz_bitlen_nonneg(r2);
+      bits1 = chiavdf_mpz_bitlen_nonneg(r1);
       bits = __GMP_MAX(bits2, bits1) - GMP_LIMB_BITS + 1;
       if (bits < 0) bits = 0;
 
-      mpz_tdiv_q_2exp(r, r2, bits);
-      rr2 = mpz_get_ui(r);
-      mpz_tdiv_q_2exp(r, r1, bits);
-      rr1 = mpz_get_ui(r);
-      mpz_tdiv_q_2exp(r, L, bits);
-      bb = mpz_get_ui(r);
+      rr2 = chiavdf_mpz_extract_uword_from_shift_nonneg(r2, bits);
+      rr1 = chiavdf_mpz_extract_uword_from_shift_nonneg(r1, bits);
+      bb  = chiavdf_mpz_extract_uword_from_shift_nonneg(L,  bits);
 
       aa2 = 0; aa1 = 1;
       bb2 = 1; bb1 = 0;
@@ -77,7 +178,9 @@ void mpz_xgcd_partial(mpz_t co2, mpz_t co1,
 
       if (i == 0)
       {
-         mpz_fdiv_qr(q, r2, r2, r1);
+         // r2,r1 are nonnegative here; trunc and floor division are equivalent, and
+         // `mpz_tdiv_qr` avoids extra sign-handling overhead.
+         mpz_tdiv_qr(q, r2, r2, r1);
          mpz_swap(r2, r1);
 
          mpz_submul(co2, co1, q);
@@ -118,7 +221,5 @@ void mpz_xgcd_partial(mpz_t co2, mpz_t co1,
       mpz_neg(co2, co2); mpz_neg(co1, co1);
       mpz_neg(r2, r2);
    }
-
-   mpz_clear(q); mpz_clear(r);
 }
 #endif /* _XGCD_PARTIAL */