diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 6772e371..9a258d6e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -20,21 +20,43 @@ permissions: jobs: fuzz_targets: - name: Run fuzzers + name: Run fuzzers (${{ matrix.target }}) runs-on: ubuntu-latest env: CARGO_PROFILE_RELEASE_LTO: false + strategy: + fail-fast: false + matrix: + target: + - create_discriminant + - prove + - verify + - verify_n_wesolowski steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@nightly + - name: Cache cargo registry + build artifacts + uses: actions/cache@v4 + with: + path: | + ~/.cargo/bin + ~/.cargo/registry + ~/.cargo/git + target + rust_bindings/fuzz/corpus + key: ${{ runner.os }}-rust-fuzz-${{ hashFiles('Cargo.lock') }} + - name: Install cargo-fuzz - run: cargo +nightly install cargo-fuzz + run: | + if ! command -v cargo-fuzz >/dev/null 2>&1; then + cargo +nightly install cargo-fuzz --locked + fi - - name: Cargo fuzz + - name: Cargo fuzz (${{ matrix.target }}) run: | cd rust_bindings - cargo fuzz list | xargs -I "%" sh -c "cargo +nightly fuzz run % -- -max_total_time=600 || exit 255" + cargo +nightly fuzz run ${{ matrix.target }} -- -max_total_time=600 lint: name: Lint @@ -110,7 +132,10 @@ jobs: - name: Install libclang-dev on Linux if: matrix.os.name == 'Ubuntu' - run: sudo apt-get install libclang-dev -y + run: | + # Avoid transient 404s from stale apt indices / mirror lag. + sudo apt-get update -y -o Acquire::Retries=3 + sudo apt-get install libclang-dev -y - name: Set up Rust uses: dtolnay/rust-toolchain@stable diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index bb8ae50d..729e19ee 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-13-intel, ubuntu-latest] + os: [macos-13-intel, macos-13-arm64, ubuntu-latest] config: [optimized=1, TSAN=1, ASAN=1] steps: @@ -34,6 +34,8 @@ jobs: - name: Build vdf-client on Ubuntu if: startsWith(matrix.os, 'ubuntu') run: | + # Avoid transient 404s from stale apt indices / mirror lag. + sudo apt-get update -y -o Acquire::Retries=3 sudo apt-get install libgmp-dev libboost-python-dev libpython3-dev libboost-system-dev build-essential -y cd src make ${{ matrix.config }} -f Makefile.vdf-client @@ -54,7 +56,11 @@ jobs: echo "Running 2weso_test" ./2weso_test echo "Running prover_test" - ./prover_test + if [[ "${{ matrix.os }}" == ubuntu* ]]; then + ./prover_test + else + CHIAVDF_PROVER_TEST_FAST=1 ./prover_test + fi - name: Test vdf-client if: matrix.config != 'optimized=1' @@ -73,7 +79,11 @@ jobs: run: | cd src echo "Running prover_test" - ./prover_test + if [[ "${{ matrix.os }}" == ubuntu* ]]; then + ./prover_test + else + CHIAVDF_PROVER_TEST_FAST=1 ./prover_test + fi - name: Benchmark vdf-client if: matrix.config == 'optimized=1' diff --git a/README.md b/README.md index a374a0cd..537cd261 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,8 @@ MacOS and Linux, chiavdf can be used to compile vdf_client and vdf_bench. vdf_client is the core VDF process that completes the Proof of Time submitted to it by the Timelord. The repo also includes a benchmarking tool to get a sense of the iterations per second of a given CPU called vdf_bench. Try -`./vdf_bench square_asm 250000` for an ips estimate. +`./vdf_bench square_asm 250000` for an ips estimate on x86/x64 (phased/asm +pipeline). On non-x86 architectures, use `./vdf_bench square 250000` (NUDUPL). To build vdf_client set the environment variable BUILD_VDF_CLIENT to "Y". `export BUILD_VDF_CLIENT=Y`. @@ -60,6 +61,16 @@ If you're running a timelord, the following tests are available, depending of wh Those tests will simulate the vdf_client and verify for correctness the produced proofs. +Note: `./prover_test` defaults to a long soak/stress run. Set +`CHIAVDF_PROVER_TEST_FAST=1` to run a short, CI-friendly correctness check. + +## Fuzzing + +Fuzz targets live under `rust_bindings/fuzz`. The `prove` target includes an +iteration cap to avoid out-of-memory conditions in CI. If you want deeper +iteration coverage, raise the cap in `rust_bindings/fuzz/fuzz_targets/prove.rs` +after validating memory usage and exec/s on your runner. + ## Contributing and workflow Contributions are welcome and more details are available in chia-blockchain's diff --git a/pyproject.toml b/pyproject.toml index 1bfd3a4a..55a04341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,10 @@ before-build = "python -m pip install --upgrade pip" [tool.cibuildwheel.macos] build-verbosity = 0 -before-all = "brew install gmp boost cmake" +before-all = """ +brew --prefix --installed gmp >/dev/null 2>&1 || brew install gmp +brew install boost cmake +""" before-build = "python -m pip install --upgrade pip" environment = {MACOSX_DEPLOYMENT_TARGET="13", SYSTEM_VERSION_COMPAT=0, BUILD_VDF_CLIENT="N"} diff --git a/rust_bindings/fuzz/fuzz_targets/prove.rs b/rust_bindings/fuzz/fuzz_targets/prove.rs index 64a962cd..cea282e4 100644 --- a/rust_bindings/fuzz/fuzz_targets/prove.rs +++ b/rust_bindings/fuzz/fuzz_targets/prove.rs @@ -1,9 +1,32 @@ #![no_main] use chiavdf::prove; -use libfuzzer_sys::fuzz_target; +use libfuzzer_sys::{fuzz_target, Corpus}; -fuzz_target!(|data: ([u8; 32], [u8; 100], u16)| { +// Fuzzing `prove()` with unbounded `iters` can explode memory usage and runtime. +// The cost of the underlying VDF prover is at least linear in `iters`, and in +// practice can become superlinear due to internal allocation patterns. We have +// observed OOM (exit 137) in CI when `iters` is allowed to reach the full `u16` +// range, so we cap it to keep fuzzing stable and high-throughput. +// +// Why 4096: +// - Large enough to exercise multiple loop iterations and proof paths beyond +// "toy" counts, preserving meaningful coverage. +// - Small enough to keep inputs fast and avoid pathological allocations across +// typical CI memory limits. +// - Selected empirically as a conservative upper bound given prior OOMs; it can +// be raised later if measurements show steady memory and acceptable exec/s. +// +// If you want deeper iteration coverage, consider a separate stress target or +// a time/iteration-budgeted harness rather than unbounded fuzz inputs. +const MAX_ITERS: u64 = 4096; + +fuzz_target!(|data: ([u8; 32], [u8; 100], u16)| -> Corpus { let (genesis_challenge, element, iters) = data; - prove(&genesis_challenge, &element, 1024, iters as u64); + let iters = iters as u64; + if iters > MAX_ITERS { + return Corpus::Reject; + } + prove(&genesis_challenge, &element, 1024, iters); + Corpus::Keep }); diff --git a/setup.py b/setup.py index b9e1545c..595f8bcb 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,7 @@ import shutil import subprocess import sys +from pathlib import Path from setuptools import Command, Extension, setup from setuptools.command.build import build @@ -134,6 +135,11 @@ def build_extension(self, ext): build.sub_commands.append(("build_hook", lambda x: True)) # type: ignore install.sub_commands.append(("install_hook", lambda x: True)) +# Wheel metadata generation on Windows can run with a non-UTF8 default encoding. +# Read `README.md` explicitly as UTF-8 so `long_description` is robust across runners. +_readme_path = Path(__file__).resolve().parent / "README.md" +_long_description = _readme_path.read_text(encoding="utf-8") + setup( name="chiavdf", author="Florin Chirica", @@ -141,7 +147,7 @@ def build_extension(self, ext): description="Chia vdf verification (wraps C++)", license="Apache-2.0", python_requires=">=3.9", - long_description=open("README.md").read(), + long_description=_long_description, long_description_content_type="text/markdown", url="https://github.com/Chia-Network/chiavdf", ext_modules=[CMakeExtension("chiavdf", "src")], diff --git a/src/Makefile.vdf-client b/src/Makefile.vdf-client index ed41963f..9b0aa5a5 100644 --- a/src/Makefile.vdf-client +++ b/src/Makefile.vdf-client @@ -1,16 +1,32 @@ UNAME := $(shell uname) +ARCH := $(shell uname -m) ifneq (,$(findstring clang, $(shell $(CXX) --version))) NOPIE = -fno-PIE else NOPIE = -no-pie endif +# macOS arm64 ignores -no_pie and warns; omit to avoid deprecation warnings +ifeq ($(UNAME),Darwin) +ifneq ($(filter $(ARCH),arm64),) +NOPIE = +endif +endif LDFLAGS += -flto $(NOPIE) -g LDLIBS += -lgmpxx -lgmp -pthread CXXFLAGS += -flto -std=c++1z -D VDF_MODE=0 -D FAST_MACHINE=1 -pthread $(NOPIE) -fvisibility=hidden ifeq ($(UNAME),Darwin) CXXFLAGS += -D CHIAOSX=1 +# Homebrew (common on macOS) installs boost/gmp to /opt/homebrew or /usr/local +ifneq ($(wildcard /opt/homebrew/include/boost/asio.hpp),) +CXXFLAGS += -I/opt/homebrew/include +LDFLAGS += -L/opt/homebrew/lib +endif +ifneq ($(wildcard /usr/local/include/boost/asio.hpp),) +CXXFLAGS += -I/usr/local/include +LDFLAGS += -L/usr/local/lib +endif endif OPT_CFLAGS = -O3 -g @@ -27,13 +43,20 @@ endif .PHONY: all clean +# Only x86_64 builds use the x86 asm objects +ifeq ($(ARCH),x86_64) +ASM_OBJS = asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o +else +ASM_OBJS = +endif + BINS = vdf_client prover_test 1weso_test 2weso_test vdf_bench all: $(BINS) clean: rm -f *.o hw/*.o $(BINS) compile_asm emu_hw_test hw_test hw_vdf_client emu_hw_vdf_client -$(BINS) avx512_test: %: %.o lzcnt.o asm_compiled.o avx2_asm_compiled.o avx512_asm_compiled.o +$(BINS) avx512_test: %: %.o lzcnt.o $(ASM_OBJS) $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS) $(addsuffix .o,$(BINS)) avx512_test.o: CXXFLAGS += $(OPT_CFLAGS) diff --git a/src/avx512_integer.h b/src/avx512_integer.h index 489408f8..9d683e7a 100644 --- a/src/avx512_integer.h +++ b/src/avx512_integer.h @@ -123,6 +123,7 @@ void mpz_impl_set_mul( const mpz& a, const mpz& b ) { +#if defined(ARCH_X86) || defined(ARCH_X64) if (enable_avx512_ifma) { typename avx512_integer_for_size::i a_avx512; typename avx512_integer_for_size::i b_avx512; @@ -132,7 +133,9 @@ void mpz_impl_set_mul( b_avx512=b; out_avx512.set_mul(a_avx512, b_avx512); out_avx512.assign(out); - } else { + } else +#endif + { mpz_mul(out._(), a._(), b._()); } } diff --git a/src/callback.h b/src/callback.h index 92207d1b..a10cb054 100644 --- a/src/callback.h +++ b/src/callback.h @@ -2,6 +2,7 @@ #define CALLBACK_H #include "util.h" +#include "nudupl_listener.h" // Applies to n-weso. const int kWindowSize = 20; @@ -32,6 +33,7 @@ class WesolowskiCallback :public INUDUPLListener { switch(type) { case NL_SQUARESTATE: { +#if defined(ARCH_X86) || defined(ARCH_X64) //cout << "NL_SQUARESTATE" << endl; uint64 res; @@ -39,6 +41,11 @@ class WesolowskiCallback :public INUDUPLListener { if(!square_state->assign(mulf->a, mulf->b, mulf->c, res)) cout << "square_state->assign failed" << endl; +#else + // Phased pipeline is x86/x64-only. + (void)data; + cout << "NL_SQUARESTATE unsupported on this architecture" << endl; +#endif break; } case NL_FORM: diff --git a/src/chiavdf_profile.h b/src/chiavdf_profile.h new file mode 100644 index 00000000..f1c4c1d7 --- /dev/null +++ b/src/chiavdf_profile.h @@ -0,0 +1,40 @@ +#ifndef CHIAVDF_PROFILE_H +#define CHIAVDF_PROFILE_H + +#include + +// This header centralizes optional profiling hooks used by `vdf.h` (driver) and +// hot-loop primitives like NUDUPL (`nucomp.h`). Everything is no-op unless: +// - `VDF_TEST` is enabled (VDF_MODE=1), and +// - the caller sets `chiavdf_nudupl_profile_sink` (and optionally enables timing). + +struct chiavdf_nudupl_profile_stats { + // Outer-loop counts (from `repeated_square_nudupl`). + uint64_t iters = 0; + uint64_t reduce_calls = 0; + uint64_t reduce_skipped = 0; + uint64_t max_a_limbs = 0; + + // Outer-loop timing (from `repeated_square_nudupl`). + uint64_t nudupl_form_time_ns = 0; + uint64_t reduce_time_ns = 0; + + // Inner-loop breakdown (from `qfb_nudupl`). + uint64_t qfb_nudupl_calls = 0; + uint64_t b_negative = 0; + uint64_t branch_a_lt_L = 0; + uint64_t branch_a_ge_L = 0; + + uint64_t gcdext_time_ns = 0; + uint64_t gcdext_s_eq_1 = 0; + uint64_t gcdext_s_ne_1 = 0; + uint64_t xgcd_partial_time_ns = 0; + uint64_t else_branch_time_ns = 0; // time spent in the a>=L branch overall +}; + +#if defined(VDF_TEST) +inline thread_local chiavdf_nudupl_profile_stats* chiavdf_nudupl_profile_sink = nullptr; +inline thread_local bool chiavdf_nudupl_profile_timing_enabled = false; +#endif + +#endif // CHIAVDF_PROFILE_H diff --git a/src/nucomp.h b/src/nucomp.h index aef14a47..708beb43 100644 --- a/src/nucomp.h +++ b/src/nucomp.h @@ -38,6 +38,15 @@ limitations under the License. #include "xgcd_partial.c" +#include "chiavdf_profile.h" + +#if defined(VDF_TEST) +#include +#endif + +#include +#include + #define LOG2(X) (63 - __builtin_clzll((X))) //using namespace std; @@ -214,13 +223,44 @@ void nucomp_form(form &a, form const& b, form const& c, integer const& D, intege void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L) { - mpz_t a1, c1, cb, k, s, t, u2, v2; - - mpz_init(a1); mpz_init(c1); - mpz_init(cb); - mpz_init(k); - mpz_init(s); - mpz_init(t); mpz_init(u2); mpz_init(v2); +#if defined(VDF_TEST) + chiavdf_nudupl_profile_stats* prof = chiavdf_nudupl_profile_sink; + const bool timing = (prof != nullptr) && chiavdf_nudupl_profile_timing_enabled; + if (prof != nullptr) { + ++prof->qfb_nudupl_calls; + } +#else + chiavdf_nudupl_profile_stats* prof = nullptr; + const bool timing = false; +#endif + // Performance note: + // This function is on the hot path for ARM `square_vdf` (NUDUPL). Avoid per-iteration + // `mpz_init/mpz_clear` churn by reusing a thread-local scratch context. + struct qfb_nudupl_ctx { + mpz_t a1, c1, cb, k, s, t, u2, v2; + mpz_t b_abs; + mpz_t m2, r1, r2, co1, co2, temp; // only used in the "a1 >= L" branch + + qfb_nudupl_ctx() { + mpz_inits(a1, c1, cb, k, s, t, u2, v2, b_abs, m2, r1, r2, co1, co2, temp, nullptr); + } + ~qfb_nudupl_ctx() { + mpz_clears(a1, c1, cb, k, s, t, u2, v2, b_abs, m2, r1, r2, co1, co2, temp, nullptr); + } + qfb_nudupl_ctx(const qfb_nudupl_ctx&) = delete; + qfb_nudupl_ctx& operator=(const qfb_nudupl_ctx&) = delete; + }; + static thread_local qfb_nudupl_ctx ctx; + + mpz_t& a1 = ctx.a1; + mpz_t& c1 = ctx.c1; + mpz_t& cb = ctx.cb; + mpz_t& k = ctx.k; + mpz_t& s = ctx.s; + mpz_t& t = ctx.t; + mpz_t& u2 = ctx.u2; + mpz_t& v2 = ctx.v2; + mpz_t& b_abs = ctx.b_abs; /* nucomp calculation */ @@ -229,29 +269,66 @@ void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L) /* c1 = c */ mpz_set(c1, f->c); - /* b < 0 */ - if (mpz_sgn(f->b) < 0) { - mpz_neg(f->b, f->b); - /* s = gcd(abs(b), a); v2 = inv(b) (mod a) */ - mpz_gcdext(s, v2, NULL, f->b, a1); - mpz_neg(f->b, f->b); - mpz_neg(v2, v2); + const int b_sgn = mpz_sgn(f->b); + + if (b_sgn < 0) { +#if defined(VDF_TEST) + if (prof != nullptr) ++prof->b_negative; + std::chrono::steady_clock::time_point t_g0; + if (timing) t_g0 = std::chrono::steady_clock::now(); +#endif + // Use |b| for gcdext/invert and apply sign afterwards; avoids mutating f->b in-place. + mpz_neg(b_abs, f->b); + /* s = gcd(|b|, a); v2 = coefficient for |b| (mod a) */ + mpz_gcdext(s, v2, NULL, b_abs, a1); + mpz_neg(v2, v2); // convert coefficient for |b| into coefficient for b (negative) +#if defined(VDF_TEST) + if (timing) { + const auto t_g1 = std::chrono::steady_clock::now(); + prof->gcdext_time_ns += uint64_t(std::chrono::duration_cast(t_g1 - t_g0).count()); + } +#endif } else { - mpz_gcdext(s, v2, NULL, f->b, a1); +#if defined(VDF_TEST) + std::chrono::steady_clock::time_point t_g0; + if (timing) t_g0 = std::chrono::steady_clock::now(); +#endif + mpz_set(b_abs, f->b); + mpz_gcdext(s, v2, NULL, b_abs, a1); +#if defined(VDF_TEST) + if (timing) { + const auto t_g1 = std::chrono::steady_clock::now(); + prof->gcdext_time_ns += uint64_t(std::chrono::duration_cast(t_g1 - t_g0).count()); + } +#endif } mpz_mul(k, v2, c1); mpz_neg(k, k); - if (mpz_cmp_ui(s, 1)) { + const bool s_is_1 = (mpz_cmp_ui(s, 1) == 0); +#if defined(VDF_TEST) + if (prof != nullptr) { + if (s_is_1) ++prof->gcdext_s_eq_1; + else ++prof->gcdext_s_ne_1; + } +#endif + + if (!s_is_1) { mpz_fdiv_q(a1, a1, s); mpz_mul(c1, c1, s); } /* k = -(c*inv(b)) (mod a) */ - mpz_fdiv_r(k, k, a1); + // `mpz_fdiv_r` implements a floor-remainder; for positive modulus `a1`, we can + // compute the (typically faster) trunc-remainder and fix up negative results. + mpz_tdiv_r(k, k, a1); + if (mpz_sgn(k) < 0) mpz_add(k, k, a1); if (mpz_cmp(a1, L) < 0) { +#if defined(VDF_TEST) + if (prof != nullptr) ++prof->branch_a_lt_L; +#endif mpz_mul(t, a1, k); mpz_mul(r->a, a1, a1); @@ -265,17 +342,34 @@ void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L) mpz_fdiv_q(r->c, r->c, a1); } else { - mpz_t m2, r1, r2, co1, co2, temp; - - mpz_init(m2); mpz_init(r1); mpz_init(r2); - mpz_init(co1); mpz_init(co2); mpz_init(temp); +#if defined(VDF_TEST) + if (prof != nullptr) ++prof->branch_a_ge_L; + std::chrono::steady_clock::time_point t_else0; + if (timing) t_else0 = std::chrono::steady_clock::now(); +#endif + mpz_t& m2 = ctx.m2; + mpz_t& r1 = ctx.r1; + mpz_t& r2 = ctx.r2; + mpz_t& co1 = ctx.co1; + mpz_t& co2 = ctx.co2; + mpz_t& temp = ctx.temp; mpz_set(r2, a1); /* r1 = k */ mpz_swap(r1, k); /* Satisfies co2*r1 - co1*r2 == +/- r2_orig */ +#if defined(VDF_TEST) + std::chrono::steady_clock::time_point t_x0; + if (timing) t_x0 = std::chrono::steady_clock::now(); +#endif mpz_xgcd_partial(co2, co1, r2, r1, L); +#if defined(VDF_TEST) + if (timing) { + const auto t_x1 = std::chrono::steady_clock::now(); + prof->xgcd_partial_time_ns += uint64_t(std::chrono::duration_cast(t_x1 - t_x0).count()); + } +#endif /* m2 = b * r1 */ mpz_mul(m2, f->b, r1); @@ -310,18 +404,15 @@ void qfb_nudupl(qfb_t r, qfb_t f, mpz_t D, mpz_t L) mpz_neg(r->a, r->a); mpz_neg(r->c, r->c); } - - mpz_clear(m2); mpz_clear(r1); mpz_clear(r2); - mpz_clear(co1); mpz_clear(co2); mpz_clear(temp); +#if defined(VDF_TEST) + if (timing) { + const auto t_else1 = std::chrono::steady_clock::now(); + prof->else_branch_time_ns += uint64_t(std::chrono::duration_cast(t_else1 - t_else0).count()); + } +#endif } mpz_set(r->b, cb); - - mpz_clear(cb); - mpz_clear(k); - mpz_clear(s); - mpz_clear(t); mpz_clear(u2); mpz_clear(v2); - mpz_clear(a1); mpz_clear(c1); } // a = b * b diff --git a/src/nudupl_listener.h b/src/nudupl_listener.h new file mode 100644 index 00000000..8aec2e19 --- /dev/null +++ b/src/nudupl_listener.h @@ -0,0 +1,19 @@ +#ifndef CHIAVDF_NUDUPL_LISTENER_H +#define CHIAVDF_NUDUPL_LISTENER_H + +#include "include.h" + +// Notification types for `INUDUPLListener::OnIteration`. +// +// NL_SQUARESTATE: payload is `square_state_type*` (x86/x64 phased pipeline only). +// NL_FORM: payload is `vdf_original::form*` (used by both the original slow loop and the ARM NUDUPL loop via a view). +#define NL_SQUARESTATE 1 +#define NL_FORM 2 + +class INUDUPLListener { +public: + virtual ~INUDUPLListener() = default; + virtual void OnIteration(int type, void* data, uint64 iteration) = 0; +}; + +#endif // CHIAVDF_NUDUPL_LISTENER_H diff --git a/src/parameters.h b/src/parameters.h index c9d89ce6..85481e4e 100644 --- a/src/parameters.h +++ b/src/parameters.h @@ -38,7 +38,7 @@ bool enable_avx512_ifma=false; #define ARCH_X86 #elif defined(__x86_64__) || defined(_M_X64) #define ARCH_X64 -#elif (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 5) || (defined(_M_ARM) && _M_ARM >= 5) || defined(__ARM_FEATURE_CLZ) /* ARM (Architecture Version 5) */ +#elif defined(__aarch64__) || (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 5) || (defined(_M_ARM) && _M_ARM >= 5) || defined(__ARM_FEATURE_CLZ) /* ARM (aarch64 or Architecture Version 5+) */ #define ARCH_ARM #endif diff --git a/src/prover_test.cpp b/src/prover_test.cpp index 30d5e198..7b453cd3 100644 --- a/src/prover_test.cpp +++ b/src/prover_test.cpp @@ -2,11 +2,32 @@ #include "verifier.h" #include "create_discriminant.h" #include +#include +#include +#include +#include +#include int segments = 7; int thread_count = 3; std::atomic stop_signal{false}; +static bool env_truthy(const char* name) +{ + const char* v = std::getenv(name); + if (v == nullptr) return false; + if (v[0] == '\0') return false; + // Accept common "truthy" strings. + if (!std::strcmp(v, "1")) return true; + if (!std::strcmp(v, "true")) return true; + if (!std::strcmp(v, "TRUE")) return true; + if (!std::strcmp(v, "yes")) return true; + if (!std::strcmp(v, "YES")) return true; + if (!std::strcmp(v, "on")) return true; + if (!std::strcmp(v, "ON")) return true; + return false; +} + Proof CreateProof(integer D, ProverManager& pm, uint64_t iteration) { Proof proof = pm.Prove(iteration); if (!stop_signal) { @@ -60,10 +81,30 @@ int main() { ProverManager pm(D, (FastAlgorithmCallback*)weso, fast_storage, segments, thread_count); pm.start(); std::vector threads; - for (int i = 0; i <= 30; i++) { - threads.emplace_back(CreateProof, D, std::ref(pm), (1 << 21) * i + 60000); + + // This binary is used by CI as a correctness test. Historically it also served as a 5-minute + // soak/stress test; that dominates the wall-clock runtime of the "all tests" run. + // + // Default behavior: run the historical long/soak test. + // Fast/CI-friendly mode: set `CHIAVDF_PROVER_TEST_FAST=1` to run just a few proofs and exit. + const bool fast_mode = env_truthy("CHIAVDF_PROVER_TEST_FAST"); + const bool is_ci = (std::getenv("CI") != nullptr) || (std::getenv("GITHUB_ACTIONS") != nullptr); + + if (!fast_mode) { + for (int i = 0; i <= 30; i++) { + threads.emplace_back(CreateProof, D, std::ref(pm), (1ULL << 21) * uint64_t(i) + 60000); + } + std::this_thread::sleep_for(std::chrono::seconds(300)); + } else { + // Keep iterations small enough to complete quickly on CI runners. + const int max_i = is_ci ? 3 : 6; + for (int i = 0; i < max_i; i++) { + threads.emplace_back(CreateProof, D, std::ref(pm), (1ULL << 18) * uint64_t(i) + 60000); + } + for (auto& t : threads) t.join(); + threads.clear(); } - std::this_thread::sleep_for (std::chrono::seconds(300)); + stop_signal = true; std::cout << "Stopping everything.\n"; pm.stop(); diff --git a/src/vdf.h b/src/vdf.h index 9ab4aef4..e83c323a 100644 --- a/src/vdf.h +++ b/src/vdf.h @@ -3,15 +3,16 @@ #include "include.h" +#if defined(ARCH_X86) || defined(ARCH_X64) #include +#endif #include "parameters.h" #include "bit_manipulation.h" #include "double_utility.h" #include "integer.h" - -#include "asm_main.h" +#include "alloc.hpp" #include "vdf_original.h" @@ -21,21 +22,30 @@ #include "gpu_integer.h" #include "gpu_integer_divide.h" +#include "nucomp.h" + +#include "nudupl_listener.h" + +#if defined(ARCH_X86) || defined(ARCH_X64) +#include "asm_main.h" + #include "gcd_base_continued_fractions.h" //#include "gcd_base_divide_table.h" #include "gcd_128.h" #include "gcd_unsigned.h" -#include "gpu_integer_gcd.h" - #include "asm_types.h" #include "threading.h" #include "avx512_integer.h" -#include "nucomp.h" #include "vdf_fast.h" +#endif + +#include "gpu_integer_gcd.h" +#if defined(ARCH_X86) || defined(ARCH_X64) #include "vdf_test.h" +#endif #include #include @@ -99,6 +109,51 @@ void repeated_square_original(vdf_original &vdfo, form& f, const integer& D, con mpz_set(f.c.impl, f_res->c); } +// Slow squaring helper using the C++ NUDUPL implementation (`qfb_nudupl`) plus Pulmark reduction. +// +// This is substantially faster than `vdf_original::square()` on some platforms (notably ARM). +// We intentionally keep the *corruption* correction path on the independent `vdf_original` +// implementation. +static inline void repeated_square_nudupl( + form& f, + integer& D, + integer& L, + uint64 base, + uint64 iterations, + WesolowskiCallback* weso, + INUDUPLListener* nuduplListener +) { + vdf_original::form f_view; + // Defensive fallback: if `weso` is null, use a Pulmark reducer. + // Construct it once per call (it does heap work) rather than per reduction. + std::optional fallback_reducer; + if (weso == nullptr) { + fallback_reducer.emplace(); + } + for (uint64_t i = 0; i < iterations; i++) { + nudupl_form(f, f, D, L); + + // Reduce only when `a` grows beyond a small limb threshold. Reducing every iteration + // can be slower than letting NUDUPL run a bit "wide". + if (__GMP_ABS(f.a.impl->_mp_size) > 8) { + if (weso) { + weso->reduce(f); + } else { + fallback_reducer->reduce(f); + } + } + + if (nuduplListener != nullptr) { + // Present the C++ `form` as a `vdf_original::form` view so existing callbacks can + // consume it without any new type tags. + f_view.a[0] = f.a.impl[0]; + f_view.b[0] = f.b.impl[0]; + f_view.c[0] = f.c.impl[0]; + nuduplListener->OnIteration(NL_FORM, &f_view, base + i); + } + } +} + // thread safe; but it is only called from the main thread void repeated_square(uint64_t iterations, form f, const integer& D, const integer& L, WesolowskiCallback* weso, FastStorage* fast_storage, std::atomic& stopped) @@ -123,8 +178,10 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege f_copy=f; c_checkpoint_interval=1; + #if defined(ARCH_X86) || defined(ARCH_X64) f_copy_3=f; f_copy_3_valid=square_fast_impl(f_copy_3, D, L, num_iterations); + #endif } #endif @@ -135,11 +192,19 @@ void repeated_square(uint64_t iterations, form f, const integer& D, const intege repeated_square_original(*weso->vdfo, f, D, L, 100); //randomize the a and b values #endif - // This works single threaded + uint64 actual_iterations = 0; +#if defined(ARCH_X86) || defined(ARCH_X64) + // x86/x64: use the phased pipeline. square_state_type square_state; - square_state.pairindex=0; - - uint64 actual_iterations=repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso); + square_state.pairindex = 0; + actual_iterations = repeated_square_fast(square_state, f, D, L, num_iterations, batch_size, weso); +#else + // Non-x86: use the C++ NUDUPL path (faster and lower maintenance than the phased pipeline). + integer& D_nc = const_cast(D); + integer& L_nc = const_cast(L); + repeated_square_nudupl(f, D_nc, L_nc, num_iterations, batch_size, weso, weso); + actual_iterations = batch_size; +#endif #ifdef VDF_TEST ++num_calls_fast; diff --git a/src/vdf_bench.cpp b/src/vdf_bench.cpp index fdd121e0..aab8feeb 100644 --- a/src/vdf_bench.cpp +++ b/src/vdf_bench.cpp @@ -2,16 +2,19 @@ #include "bit_manipulation.h" #include "double_utility.h" #include "parameters.h" -#include "asm_main.h" #include "integer.h" +#include "alloc.hpp" #include "vdf_new.h" #include "nucomp.h" #include "picosha2.h" #include "proof_common.h" +#if defined(ARCH_X86) || defined(ARCH_X64) +#include "asm_main.h" #include "threading.h" #include "avx512_integer.h" #include "vdf_fast.h" +#endif #include "create_discriminant.h" #include @@ -46,6 +49,7 @@ int main(int argc, char **argv) auto t1 = std::chrono::high_resolution_clock::now(); if (!strcmp(argv[1], "square_asm")) { is_asm = true; +#if defined(ARCH_X86) || defined(ARCH_X64) for (i = 0; i < iters; ) { square_state_type sq_state; sq_state.pairindex = 0; @@ -64,6 +68,17 @@ int main(int argc, char **argv) i += done; } } +#else + // On non-x86 architectures we don't build the phased/asm pipeline. + // Keep script compatibility by treating `square_asm` as a NUDUPL benchmark. + for (i = 0; i < iters; i++) { + nudupl_form(y, y, D, L); + if (__GMP_ABS(y.a.impl->_mp_size) > 8) { + reducer.reduce(y); + } + } + is_asm = false; +#endif } else if (!strcmp(argv[1], "square")) { for (i = 0; i < iters; i++) { nudupl_form(y, y, D, L); diff --git a/src/vdf_fast.h b/src/vdf_fast.h index 34d47586..571a0d04 100644 --- a/src/vdf_fast.h +++ b/src/vdf_fast.h @@ -994,13 +994,7 @@ struct square_state_type { }*/ }; -#define NL_SQUARESTATE 1 -#define NL_FORM 2 - -class INUDUPLListener{ -public: - virtual void OnIteration(int type, void *data, uint64 iteration)=0; -}; +#include "nudupl_listener.h" //this should never have an infinite loop //the gcd loops all have maximum counters after which they'll error out, and the thread_state loops also have a maximum spin counter diff --git a/src/xgcd_partial.c b/src/xgcd_partial.c index 536e6c19..7e7adf49 100644 --- a/src/xgcd_partial.c +++ b/src/xgcd_partial.c @@ -24,32 +24,133 @@ #define _XGCD_PARTIAL #include +#include + +#if defined(_MSC_VER) +#include + +// MSVC doesn't provide `__builtin_clz*`; use bit-scan intrinsics instead. +static inline int chiavdf_clz_u32(unsigned long x) +{ + unsigned long idx = 0; + _BitScanReverse(&idx, x); // x != 0 + return 31 - (int)idx; +} + +static inline int chiavdf_clz_u64(unsigned __int64 x) +{ + unsigned long idx = 0; +#if defined(_M_X64) || defined(_M_ARM64) + _BitScanReverse64(&idx, x); // x != 0 + return 63 - (int)idx; +#else + // 32-bit targets: synthesize using two 32-bit scans. + const unsigned long hi = (unsigned long)(x >> 32); + if (hi != 0) { + _BitScanReverse(&idx, hi); + return 31 - (int)idx; + } + const unsigned long lo = (unsigned long)(x & 0xffffffffu); + _BitScanReverse(&idx, lo); + return 63 - (int)idx; +#endif +} +#endif + +// Fast helpers (avoid mpz temporaries in tight loops). +static inline mp_limb_signed_t chiavdf_mpz_bitlen_nonneg(const mpz_t x) +{ + // Match mpz_sizeinbase(x, 2) for x >= 0: + // - returns 1 for x == 0 + // - otherwise returns exact bit length + const size_t n = mpz_size(x); // number of limbs (abs) + if (n == 0) return 1; + const mp_limb_t top = mpz_getlimbn(x, (mp_size_t)(n - 1)); + // top is non-zero when n != 0, but be defensive. + if (top == 0) return 1; +#if GMP_LIMB_BITS == 64 +#if defined(_MSC_VER) + const int lead = chiavdf_clz_u64((unsigned __int64)top); +#else + const int lead = __builtin_clzll((unsigned long long)top); +#endif +#elif GMP_LIMB_BITS == 32 +#if defined(_MSC_VER) + const int lead = chiavdf_clz_u32((unsigned long)top); +#else + const int lead = __builtin_clz((unsigned int)top); +#endif +#else + // Fallback (unlikely): conservative loop. + int lead = 0; + for (int b = GMP_LIMB_BITS - 1; b >= 0; --b) { + if ((top >> b) & 1) break; + ++lead; + } +#endif + const mp_limb_signed_t top_bits = (mp_limb_signed_t)(GMP_LIMB_BITS - lead); + return (mp_limb_signed_t)((n - 1) * (size_t)GMP_LIMB_BITS) + top_bits; +} + +static inline mp_limb_signed_t chiavdf_mpz_extract_uword_from_shift_nonneg(const mpz_t x, mp_limb_signed_t shift_bits) +{ + // Return the low word of (x >> shift_bits), assuming x >= 0 and shift_bits >= 0. + // This is what `mpz_get_ui(tmp)` would yield after `mpz_tdiv_q_2exp(tmp, x, shift_bits)`, + // but without allocating or touching an mpz temp. + if (shift_bits <= 0) { + // limb 0 is enough for our use here. + return (mp_limb_signed_t)mpz_getlimbn(x, 0); + } + const mp_limb_signed_t limb_bits = (mp_limb_signed_t)GMP_LIMB_BITS; + const mp_limb_signed_t limb_idx = shift_bits / limb_bits; + const mp_limb_signed_t off = shift_bits - limb_idx * limb_bits; + mp_limb_t lo = mpz_getlimbn(x, (mp_size_t)limb_idx); + if (off == 0) return (mp_limb_signed_t)lo; + mp_limb_t hi = mpz_getlimbn(x, (mp_size_t)(limb_idx + 1)); + lo >>= (unsigned)off; + hi <<= (unsigned)(limb_bits - off); + return (mp_limb_signed_t)(lo | hi); +} void mpz_xgcd_partial(mpz_t co2, mpz_t co1, mpz_t r2, mpz_t r1, const mpz_t L) { - mpz_t q, r; + // Hot-path note: + // This function can run in the inner loop of NUDUPL; avoid per-call + // `mpz_init/mpz_clear` by using thread-local temporaries. + // + // Important for ASAN/LSan: these temporaries must be freed at thread-exit, otherwise + // LeakSanitizer will report per-thread GMP allocations as leaked. + struct chiavdf_xgcd_partial_tls { + mpz_t q; + mpz_t r; + + chiavdf_xgcd_partial_tls() { mpz_init(q); mpz_init(r); } + ~chiavdf_xgcd_partial_tls() { mpz_clear(q); mpz_clear(r); } + + chiavdf_xgcd_partial_tls(const chiavdf_xgcd_partial_tls&) = delete; + chiavdf_xgcd_partial_tls& operator=(const chiavdf_xgcd_partial_tls&) = delete; + }; + static thread_local chiavdf_xgcd_partial_tls tls; + mpz_ptr q = tls.q; + mpz_ptr r = tls.r; mp_limb_signed_t aa2, aa1, bb2, bb1, rr1, rr2, qq, bb, t1, t2, t3, i; mp_limb_signed_t bits, bits1, bits2; - mpz_init(q); mpz_init(r); - mpz_set_ui(co2, 0); mpz_set_si(co1, -1); while (mpz_cmp_ui(r1, 0) && mpz_cmp(r1, L) > 0) { - bits2 = mpz_sizeinbase(r2, 2); - bits1 = mpz_sizeinbase(r1, 2); + // r2/r1 are expected to be nonnegative here (algorithm maintains sign after each step). + bits2 = chiavdf_mpz_bitlen_nonneg(r2); + bits1 = chiavdf_mpz_bitlen_nonneg(r1); bits = __GMP_MAX(bits2, bits1) - GMP_LIMB_BITS + 1; if (bits < 0) bits = 0; - mpz_tdiv_q_2exp(r, r2, bits); - rr2 = mpz_get_ui(r); - mpz_tdiv_q_2exp(r, r1, bits); - rr1 = mpz_get_ui(r); - mpz_tdiv_q_2exp(r, L, bits); - bb = mpz_get_ui(r); + rr2 = chiavdf_mpz_extract_uword_from_shift_nonneg(r2, bits); + rr1 = chiavdf_mpz_extract_uword_from_shift_nonneg(r1, bits); + bb = chiavdf_mpz_extract_uword_from_shift_nonneg(L, bits); aa2 = 0; aa1 = 1; bb2 = 1; bb1 = 0; @@ -77,7 +178,9 @@ void mpz_xgcd_partial(mpz_t co2, mpz_t co1, if (i == 0) { - mpz_fdiv_qr(q, r2, r2, r1); + // r2,r1 are nonnegative here; trunc and floor division are equivalent, and + // `mpz_tdiv_qr` avoids extra sign-handling overhead. + mpz_tdiv_qr(q, r2, r2, r1); mpz_swap(r2, r1); mpz_submul(co2, co1, q); @@ -118,7 +221,5 @@ void mpz_xgcd_partial(mpz_t co2, mpz_t co1, mpz_neg(co2, co2); mpz_neg(co1, co1); mpz_neg(r2, r2); } - - mpz_clear(q); mpz_clear(r); } #endif /* _XGCD_PARTIAL */