From 816901a8c6db4aa5c231319823fd334319c7ada7 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Thu, 2 Apr 2026 13:46:13 -0500
Subject: [PATCH 01/60] VoxelBlockManager: add resolveWenoLeafPtrs and
 computeWenoStencil

Add WenoLeafPtrs, resolveWenoLeafPtrs, and computeWenoStencil as static
__device__ members of VoxelBlockManager. These implement the first phase of a
two-function WENO5 stencil gather: resolveWenoLeafPtrs performs exactly 3
probeLeaf calls (one per axis) to resolve neighbor leaf pointers;
computeWenoStencil fills a caller-provided array with the 19 global sequential
indices using WenoPt::idx.

voxelOffset arithmetic uses octal notation: NanoVDB leaf layout encodes
(x,y,z) as x*64+y*8+z, so x/y/z strides are 0100/010/1 in octal.

WenoPt::idx is used throughout to remain independent of any future
re-alignment with OpenVDB's NineteenPt (which uses a different convention).

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 .../nanovdb/tools/cuda/VoxelBlockManager.cuh  | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)

diff --git a/nanovdb/nanovdb/tools/cuda/VoxelBlockManager.cuh b/nanovdb/nanovdb/tools/cuda/VoxelBlockManager.cuh
index 02f5180f67..d53839cab3 100644
--- a/nanovdb/nanovdb/tools/cuda/VoxelBlockManager.cuh
+++ b/nanovdb/nanovdb/tools/cuda/VoxelBlockManager.cuh
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include

 namespace nanovdb {

@@ -166,6 +167,144 @@ struct VoxelBlockManager : nanovdb::tools::VoxelBlockManagerBase
             }
         }
     }
+
+    /// @brief Auxiliary type holding the resolved neighbor leaf pointers for
+    /// the WENO5 stencil. ptrs[axis][0] is the lo neighbor along that axis
+    /// (nullptr if outside the narrow band), ptrs[axis][1] is always the
+    /// center leaf, and ptrs[axis][2] is the hi neighbor (nullptr if outside).
+    template <typename BuildT>
+    struct WenoLeafPtrs {
+        const NanoLeaf<BuildT>* ptrs[3][3];
+    };
+
+    /// @brief Resolve the neighbor leaf pointers needed by computeWenoStencil.
+    /// Performs exactly one probeLeaf call per axis (three total). Safe to call
+    /// per-thread; does not synchronize.
+    /// @tparam BuildT Build type of the grid (must be an index type)
+    /// @param grid Device-resident grid
+    /// @param leaf Center leaf node for the current voxel
+    /// @param voxelOffset Intra-leaf voxel offset for the current voxel
+    /// @return WenoLeafPtrs with center entries set to &leaf and lo/hi entries
+    /// set to the probeLeaf result (nullptr if outside the narrow band)
+    template <typename BuildT>
+    __device__
+    static typename util::enable_if<BuildTraits<BuildT>::is_index, WenoLeafPtrs<BuildT>>::type
+    resolveWenoLeafPtrs(
+        const NanoGrid<BuildT>* grid,
+        const NanoLeaf<BuildT>& leaf,
+        uint16_t voxelOffset)
+    {
+        WenoLeafPtrs<BuildT> result;
+        const auto coord = leaf.offsetToGlobalCoord(voxelOffset);
+        const auto localCoord = leaf.OffsetToLocalCoord(voxelOffset);
+        const auto& tree = grid->tree();
+
+        for (int axis = 0; axis < 3; ++axis) {
+            result.ptrs[axis][0] = nullptr;
+            result.ptrs[axis][1] = &leaf;
+            result.ptrs[axis][2] = nullptr;
+
+            auto neighborCoord = coord;
+            neighborCoord[axis] += (localCoord[axis] & 0x4) ? 4 : -4;
+            result.ptrs[axis][(localCoord[axis] & 0x4) >> 1] =
+                tree.root().probeLeaf(neighborCoord);
+        }
+        return result;
+    }
+
+    /// @brief Compute global sequential indices for the 19 WENO5 stencil
+    /// points of the given voxel, using pre-resolved leaf pointers.
+    ///
+    /// Output layout follows nanovdb::math::WenoPt::idx. Note that
+    /// this convention differs from OpenVDB's NineteenPt::idx.
+    ///
+    /// Entries for neighbors outside the narrow band are left unchanged;
+    /// the caller must zero-initialize data[] before calling this function.
+    /// Does not synchronize; safe to call from divergent threads.
+    ///
+    /// The voxelOffset arithmetic uses octal notation to exploit the fact that
+    /// the NanoVDB leaf layout encodes (x,y,z) as x*64 + y*8 + z, making x, y,
+    /// and z strides exactly 0100, 010, and 1 in octal respectively.
+    ///
+    /// @tparam BuildT Build type of the grid (must be an index type)
+    /// @param leaf Center leaf node for the current voxel
+    /// @param voxelOffset Intra-leaf voxel offset for the current voxel
+    /// @param leafPtrs Resolved neighbor leaf pointers from resolveWenoLeafPtrs
+    /// @param data Output array of length >= 19, caller-zero-initialized
+    template <typename BuildT>
+    __device__
+    static typename util::enable_if<BuildTraits<BuildT>::is_index, void>::type
+    computeWenoStencil(
+        const NanoLeaf<BuildT>& leaf,
+        uint16_t voxelOffset,
+        const WenoLeafPtrs<BuildT>& leafPtrs,
+        uint64_t* data)
+    {
+        using math::WenoPt;
+        const auto lc = leaf.OffsetToLocalCoord(voxelOffset);
+
+        data[WenoPt< 0, 0, 0>::idx] = leaf.getValue(voxelOffset);
+
+        // x-axis: stride per step = 64 = 0100 octal; crossing into a neighbor
+        // leaf adjusts the naive step by ∓8*64 = ∓01000 (net offsets ±0500..±0700)
+        if (leafPtrs.ptrs[0][(lc.x() + 5) >> 3])
+            data[WenoPt<-3, 0, 0>::idx] = leafPtrs.ptrs[0][(lc.x() + 5) >> 3]->getValue(
+                voxelOffset + ((lc[0] < 3) ? 0500 : -0300));
+        if (leafPtrs.ptrs[0][(lc.x() + 6) >> 3])
+            data[WenoPt<-2, 0, 0>::idx] = leafPtrs.ptrs[0][(lc.x() + 6) >> 3]->getValue(
+                voxelOffset + ((lc[0] < 2) ? 0600 : -0200));
+        if (leafPtrs.ptrs[0][(lc.x() + 7) >> 3])
+            data[WenoPt<-1, 0, 0>::idx] = leafPtrs.ptrs[0][(lc.x() + 7) >> 3]->getValue(
+                voxelOffset + ((lc[0] < 1) ? 0700 : -0100));
+        if (leafPtrs.ptrs[0][(lc.x() + 9) >> 3])
+            data[WenoPt< 1, 0, 0>::idx] = leafPtrs.ptrs[0][(lc.x() + 9) >> 3]->getValue(
+                voxelOffset + ((lc[0] < 7) ? 0100 : -0700));
+        if (leafPtrs.ptrs[0][(lc.x() + 10) >> 3])
+            data[WenoPt< 2, 0, 0>::idx] = leafPtrs.ptrs[0][(lc.x() + 10) >> 3]->getValue(
+                voxelOffset + ((lc[0] < 6) ? 0200 : -0600));
+        if (leafPtrs.ptrs[0][(lc.x() + 11) >> 3])
+            data[WenoPt< 3, 0, 0>::idx] = leafPtrs.ptrs[0][(lc.x() + 11) >> 3]->getValue(
+                voxelOffset + ((lc[0] < 5) ? 0300 : -0500));
+
+        // y-axis: stride per step = 8 = 010 octal; a leaf crossing adjusts by
+        // ∓8*8 = ∓0100 (net offsets ±0050..±0070)
+        if (leafPtrs.ptrs[1][(lc.y() + 5) >> 3])
+            data[WenoPt< 0,-3, 0>::idx] = leafPtrs.ptrs[1][(lc.y() + 5) >> 3]->getValue(
+                voxelOffset + ((lc[1] < 3) ? 0050 : -0030));
+        if (leafPtrs.ptrs[1][(lc.y() + 6) >> 3])
+            data[WenoPt< 0,-2, 0>::idx] = leafPtrs.ptrs[1][(lc.y() + 6) >> 3]->getValue(
+                voxelOffset + ((lc[1] < 2) ? 0060 : -0020));
+        if (leafPtrs.ptrs[1][(lc.y() + 7) >> 3])
+            data[WenoPt< 0,-1, 0>::idx] = leafPtrs.ptrs[1][(lc.y() + 7) >> 3]->getValue(
+                voxelOffset + ((lc[1] < 1) ? 0070 : -0010));
+        if (leafPtrs.ptrs[1][(lc.y() + 9) >> 3])
+            data[WenoPt< 0, 1, 0>::idx] = leafPtrs.ptrs[1][(lc.y() + 9) >> 3]->getValue(
+                voxelOffset + ((lc[1] < 7) ? 0010 : -0070));
+        if (leafPtrs.ptrs[1][(lc.y() + 10) >> 3])
+            data[WenoPt< 0, 2, 0>::idx] = leafPtrs.ptrs[1][(lc.y() + 10) >> 3]->getValue(
+                voxelOffset + ((lc[1] < 6) ? 0020 : -0060));
+        if (leafPtrs.ptrs[1][(lc.y() + 11) >> 3])
+            data[WenoPt< 0, 3, 0>::idx] = leafPtrs.ptrs[1][(lc.y() + 11) >> 3]->getValue(
+                voxelOffset + ((lc[1] < 5) ? 0030 : -0050));
+
+        // z-axis: stride per step = 1; a leaf crossing adjusts by ∓8 = ∓010
+        // (net offsets ±0005..±0007)
+        if (leafPtrs.ptrs[2][(lc.z() + 5) >> 3])
+            data[WenoPt< 0, 0,-3>::idx] = leafPtrs.ptrs[2][(lc.z() + 5) >> 3]->getValue(
+                voxelOffset + ((lc[2] < 3) ? 0005 : -0003));
+        if (leafPtrs.ptrs[2][(lc.z() + 6) >> 3])
+            data[WenoPt< 0, 0,-2>::idx] = leafPtrs.ptrs[2][(lc.z() + 6) >> 3]->getValue(
+                voxelOffset + ((lc[2] < 2) ? 0006 : -0002));
+        if (leafPtrs.ptrs[2][(lc.z() + 7) >> 3])
+            data[WenoPt< 0, 0,-1>::idx] = leafPtrs.ptrs[2][(lc.z() + 7) >> 3]->getValue(
+                voxelOffset + ((lc[2] < 1) ? 0007 : -0001));
+        if (leafPtrs.ptrs[2][(lc.z() + 9) >> 3])
+            data[WenoPt< 0, 0, 1>::idx] = leafPtrs.ptrs[2][(lc.z() + 9) >> 3]->getValue(
+                voxelOffset + ((lc[2] < 7) ? 0001 : -0007));
+        if (leafPtrs.ptrs[2][(lc.z() + 10) >> 3])
+            data[WenoPt< 0, 0, 2>::idx] = leafPtrs.ptrs[2][(lc.z() + 10) >> 3]->getValue(
+                voxelOffset + ((lc[2] < 6) ? 0002 : -0006));
+        if (leafPtrs.ptrs[2][(lc.z() + 11) >> 3])
+            data[WenoPt< 0, 0, 3>::idx] = leafPtrs.ptrs[2][(lc.z() + 11) >> 3]->getValue(
+                voxelOffset + ((lc[2] < 5) ? 0003 : -0005));
+    }
 };

 /// @brief This functor calculates the firstLeafID and jumpMap for the

From c9906f4362282885f378b5d9df6bd016932a8cb9 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Thu, 2 Apr 2026 13:46:20 -0500
Subject: [PATCH 02/60] ex_voxelBlockManager_host_cuda: add host/CUDA VBM
 example

Adds the ex_voxelBlockManager_host_cuda example demonstrating the CPU and CUDA
VoxelBlockManager implementations, along with design documentation.

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/examples/CMakeLists.txt       |    7 +
 .../DecodeInverseMapsCPUPlan.md               | 1271 +++++++++++++++++
 .../VBMImplementationKnowledge.md             |  269 ++++
 .../VoxelBlockManagerContext.md               |  259 ++++
 .../vbm_host_cuda.cpp                         |   96 ++
 .../vbm_host_cuda_kernels.cu                  |  476 ++++++
 6 files changed, 2378 insertions(+)
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VoxelBlockManagerContext.md
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda.cpp
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu

diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt
index 780a4f59bd..898a412987 100644
--- a/nanovdb/nanovdb/examples/CMakeLists.txt
+++ b/nanovdb/nanovdb/examples/CMakeLists.txt
@@ -114,6 +114,13 @@ nanovdb_example(NAME "ex_merge_nanovdb_cuda" OPENVDB)
 nanovdb_example(NAME "ex_refine_nanovdb_cuda" OPENVDB)
 nanovdb_example(NAME "ex_coarsen_nanovdb_cuda" OPENVDB)

+nanovdb_example(NAME "ex_voxelBlockManager_host_cuda")
+if(TARGET ex_voxelBlockManager_host_cuda)
+  target_compile_options(ex_voxelBlockManager_host_cuda PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-mavx2,-fopenmp-simd>
+    $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -fopenmp-simd>)
+endif()
+
 if(CUDAToolkit_FOUND)
   nanovdb_example(NAME "ex_make_mgpu_nanovdb") # requires cuRAND
   target_link_libraries(ex_make_mgpu_nanovdb PRIVATE CUDA::curand)
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md
new file mode 100644
index 0000000000..6628ab4eee
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md
@@ -0,0 +1,1271 @@
+# CPU `decodeInverseMaps` Implementation Plan
+
+This document captures the full design for porting `VoxelBlockManager::decodeInverseMaps`
+to the host.
+It is the companion to `VoxelBlockManagerContext.md` and serves as a starting point
+for implementing the function in `nanovdb/tools/VoxelBlockManager.h`.
+
+---
+
+## 1. Threading Model
+
+The GPU version uses a thread block (up to 512 threads) to decode one voxel block cooperatively.
+The CPU version inverts the axes:
+
+- **Outer**: OpenMP thread parallelism over voxel blocks (one block per thread).
+- **Inner**: SIMD within `decodeInverseMaps` for the single block assigned to the calling thread.
+
+`decodeInverseMaps` itself is single-threaded. It does not use OpenMP internally. The caller
+(the `buildVoxelBlockManager` CPU path) is responsible for distributing blocks across threads.
+
+---
+
+## 2. Outputs
+
+Like the GPU version, the function fills two arrays for a single voxel block:
+
+```cpp
+uint32_t leafIndex[BlockWidth];   // smem_leafIndex on GPU
+uint16_t voxelOffset[BlockWidth]; // smem_voxelOffset on GPU
+```
+
+Sentinel values for positions beyond the last active voxel:
+```cpp
+static constexpr uint32_t UnusedLeafIndex = 0xffffffff;
+static constexpr uint16_t UnusedVoxelOffset = 0xffff;
+```
+
+---
+
+## 3. Leaf Iteration (unchanged from GPU)
+
+Iterate leaf IDs from `firstLeafID[blockID]` through `firstLeafID[blockID] + nExtraLeaves`,
+where `nExtraLeaves` is the popcount of the block's slice of the jump map,
+`jumpMap[blockID * JumpMapLength .. (blockID + 1) * JumpMapLength)`.
+
+For each leaf, compute:
+- `leafFirstOffset = leaf.data()->firstOffset()`
+- `leafValueCount = leaf.data()->valueCount()` ← number of active voxels in this leaf
+- `pStart = max(0, leafFirstOffset - blockFirstOffset)` — first slot in the block's output arrays
+- `pEnd = min(BlockWidth, leafFirstOffset + leafValueCount - blockFirstOffset)` — one past last slot
+- `jStart = (leafFirstOffset < blockFirstOffset) ? blockFirstOffset - leafFirstOffset : 0`
+  — index of first active voxel in the leaf that falls inside this block
+
+Then:
+- `leafIndex[pStart..pEnd) = leafID` (range fill, no scatter)
+- `voxelOffset[pStart..pEnd) = leafLocalOffsets[jStart..jStart+(pEnd-pStart))` (contiguous copy)
+
+where `leafLocalOffsets[j]` = local offset (0..511) of the j-th active voxel in this leaf.
+
+---
+
+## 4. Producing `leafLocalOffsets` via Stream Compaction
+
+`leafLocalOffsets` is the stream compaction of {0, 1, …, 511} under `valueMask`. It maps
+dense index j → local offset of the j-th active voxel.
+
+The GPU scatter `smem_voxelOffset[index - blockFirstOffset] = localOffset` is *equivalent* to
+this compaction: as `localOffset` increases 0..511, the scatter destination
+`index - blockFirstOffset` is non-decreasing. So the scatter is really: pack all active local
+offsets in order → copy a contiguous slice.
+
+### 4a. SIMD Word Layout: 16 × 32-bit Words
+
+Treat `valueMask` as **16 `uint32_t` words** (not 8 `uint64_t`). Rationale:
+
+- 16 words fill a full AVX-512 register (16 × 32-bit lanes) or two AVX2 registers (8 × 32-bit lanes).
+- The multiply step in the vertical sweep (see §4c) uses `VPMULLD` (AVX2, SSE4.1) for 32-bit
+  multiply, which is widely available. The 64-bit equivalent `VPMULLQ` requires AVX-512DQ.
+- Software popcount on 32-bit words uses AND/shift/add/multiply — all available as 32-bit SIMD
+  ops in AVX2. A `#pragma omp simd` loop over 16 words auto-vectorizes without `VPOPCNTQ`.
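+
+The Phase 1 loop in §4c below calls a `popcount32` helper that this plan never
+defines; a minimal portable sketch (the name and placement are ours, any
+SIMD-friendly Hamming weight works) is:
+
+```cpp
+// Hedged sketch: software Hamming weight for one 32-bit word. Pure
+// shift/AND/multiply form, so a #pragma omp simd loop over 16 words can
+// vectorize it without a hardware POPCNT instruction (see the rationale above).
+static inline uint32_t popcount32(uint32_t v)
+{
+    v = v - ((v >> 1) & 0x55555555u);                 // 2-bit partial sums
+    v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit partial sums
+    v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
+    return (v * 0x01010101u) >> 24;                   // fold bytes into top byte
+}
+```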
+
+### 4b. Storage Layout: `prefixCountRealigned[32][16]`
+
+Declare a `uint32_t` array shaped as `[bitStep][lane]`:
+
+```cpp
+alignas(32) uint32_t prefixCountRealigned[/*bitStep*/32][/*lane*/16];
+```
+
+- **lane** (0..15): indexes which 32-bit word of the valueMask (one per group of 32 consecutive voxels).
+- **bitStep** (0..31): indexes bit position within the word.
+- `prefixCountRealigned[step][lane]` = **inclusive** prefix popcount = number of active voxels
+  in positions 0..step of word `lane`.
+
+Storage is `uint32_t` throughout to match `popcount32`'s natural precision and avoid
+narrowing conversions. A full row `prefixCountRealigned[step]` is 16 × 4 = 64 bytes:
+- **AVX2**: two `__m256i` registers per row (8 uint32_t each)
+- **AVX-512**: one `__m512i` register per row (16 uint32_t) — the layout is designed for this upgrade
+
+### 4c. Phase 1 — Per-Word Inclusive Prefix Counts (SIMD)
+
+For each `step` in 0..31, compute `prefixCountRealigned[step][lane]` for all 16 lanes
+simultaneously via a `#pragma omp simd` loop:
+
+```cpp
+const uint32_t* maskWords =
+    reinterpret_cast<const uint32_t*>(leaf.valueMask().words());
+
+for (int step = 0; step < 32; step++) {
+    // NOTE: use (uint32_t(2) << step) - 1u, NOT (1u << (step+1)) - 1u.
+    // The latter is UB at step=31 (shift by 32 on a 32-bit type).
+    // The safe form: at step=31, (2u << 31) overflows to 0 (defined for unsigned),
+    // and 0 - 1u wraps to 0xFFFFFFFF (all bits set) — correct inclusive mask.
+    const uint32_t mask = (uint32_t(2) << step) - 1u;
+    #pragma omp simd
+    for (int lane = 0; lane < 16; lane++)
+        prefixCountRealigned[step][lane] = popcount32(maskWords[lane] & mask);
+}
+```
+
+At `step=31`, `mask = 0xFFFFFFFF`, so `prefixCountRealigned[31][lane] = wordPopcount[lane]`
+(the full per-word active voxel count) — no separate word-popcount pass needed.
+
+### 4d. Phase 2 — Cross-Word Prefix Sum and Global Conversion
+
+Read the last row to get per-word counts, compute their exclusive prefix scan (scalar — short
+dependency chain), then add `baseOffset[lane]` to every row in a second SIMD pass:
+
+```cpp
+// Exclusive prefix scan of the last row → baseOffset[lane]
+uint32_t baseOffset[16];
+baseOffset[0] = 0;
+for (int lane = 1; lane < 16; lane++)
+    baseOffset[lane] = baseOffset[lane-1] + prefixCountRealigned[31][lane-1];
+
+// Add baseOffset to every row: converts per-word to global prefix counts
+for (int step = 0; step < 32; step++) {
+    #pragma omp simd
+    for (int lane = 0; lane < 16; lane++)
+        prefixCountRealigned[step][lane] += baseOffset[lane];
+}
+```
+
+`baseOffset` is constant across all 32 steps for a given lane, so each row's SIMD add is a
+simple lane-wise addition with no broadcast required. After this pass,
+`prefixCountRealigned[step][lane]` holds the full global inclusive prefix count for voxel
+`step + 32*lane` — i.e., the sequential index of that voxel within the leaf (0-based) if it
+is active, counting all active voxels before it across all words.
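+
+Before moving to the alternative below, note how the §4c/§4d results turn into
+`leafLocalOffsets`: one conditional-store pass suffices. A sketch (variable
+names ours; this is exactly the data-dependent scatter that §4e eliminates):
+
+```cpp
+// Hedged sketch: compact the active local offsets using the Phase 2 counts.
+// prefixCountRealigned[step][lane] is the INCLUSIVE global prefix count for
+// voxel lane*32 + step, so an active voxel's dense index is its count - 1.
+uint16_t leafLocalOffsets[512];
+for (int lane = 0; lane < 16; lane++)
+    for (int step = 0; step < 32; step++)
+        if ((maskWords[lane] >> step) & 1u) {
+            const uint32_t j = prefixCountRealigned[step][lane] - 1u; // dense index
+            leafLocalOffsets[j] = static_cast<uint16_t>(lane * 32 + step);
+        }
+```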
+
+### 4e. Parallel Prefix Compaction via `shfl_down` (Alternative / Deeper SIMD)
+
+This approach avoids data-dependent stores entirely and is the approach validated in
+`simd_test/shfl_down_test.cpp`.
+
+**Key insight**: Define `shifts[i]` = number of zeros before position i in the bitmask =
+`i - (dense_index_of_voxel_i)`. The compaction moves each active voxel at position i down by
+`shifts[i]`. Decompose `shifts[i]` in binary: apply log2(BlockWidth) passes. Pass k moves
+elements down by 2^k *if* bit k of `shifts[i]` is set:
+
+```cpp
+// Templated fixed-offset conditional blend
+template <int Width, int Shift>
+void shfl_down(uint16_t* data, const bool* move) {
+    #pragma omp simd
+    for (int i = 0; i < Width - Shift; i++)
+        if (move[i]) data[i] = data[i + Shift];
+}
+```
+
+Each pass is a **fixed-offset conditional copy** — the write index is data-independent.
+Compiles to clean masked blend operations:
+- **AVX-512**: `vmovdqu32` with a mask register (single instruction per pass)
+- **AVX2**: double-negate blend pattern (no register-level shuffle needed)
+
+The `move[i]` predicate for pass k is: `(shifts[i] & (1 << k)) != 0`, which itself depends on
+the bitmask but can be computed upfront via popcount before the blend passes.
+
+**Practical recommendation**: Start with the simpler vertical sweep (§4c/§4d). Fall back to
+`shfl_down` if the compiler fails to vectorize the conditional store or if profiling shows it
+is the bottleneck.
+
+**TODO**: Investigate whether this collective SIMD prefix-popcount approach could benefit the
+CUDA `decodeInverseMaps` as well. The current GPU implementation iterates all 512 voxel slots
+via `getValue()` (one thread per slot cooperatively across the warp), which is already very fast
+(~0.039 ms for 16384 blocks). Given that baseline, a rewrite is unlikely to be worthwhile, but
+it may be worth a quick look once the CPU path is mature.
+
+---
+
+## 5. Bypassing `mPrefixSum`
+
+The leaf stores a packed 9-bit `mPrefixSum` for random access. Do **not** use it here.
+
+For bulk sequential access over all 512 voxels, recomputing per-word popcounts from scratch via
+SIMD is cheaper than unpacking the 9-bit packed fields (which requires masked shifts and is
+awkward to vectorize). The vertical sweep (§4c) naturally computes exactly what is needed.
+
+---
+
+## 6. `leafIndex` Fill (Trivial)
+
+```cpp
+std::fill(leafIndex + pStart, leafIndex + pEnd, (uint32_t)leafID);
+```
+
+No scatter. `leafID` is constant per leaf.
+
+---
+
+## 7. `voxelOffset` Fill (Contiguous Copy)
+
+```cpp
+std::copy(leafLocalOffsets + jStart,
+          leafLocalOffsets + jStart + (pEnd - pStart),
+          voxelOffset + pStart);
+```
+
+`leafLocalOffsets` is produced once per leaf (§4) and then sliced into the output array.
+
+---
+
+## 8. Initialization
+
+Before iterating over leaves, initialize sentinel values for the whole block:
+
+```cpp
+std::fill(leafIndex, leafIndex + BlockWidth, UnusedLeafIndex);
+std::fill(voxelOffset, voxelOffset + BlockWidth, UnusedVoxelOffset);
+```
+
+**Important:** `std::fill` on a `threadprivate` TLS pointer does **not** auto-vectorize to AVX2
+stores even when `-mavx2` is enabled. The compiler cannot prove alignment through the TLS
+indirection, so it falls back to scalar or SSE stores. Explicit AVX2 intrinsics with an
+`(__m256i*)` cast are required to get `vmovdqa` and recover the expected bandwidth. On the test
+machine (no AVX-512), using explicit `_mm256_store_si256` over `alignas(64)` arrays brought the
+initialization cost from ~1.5 ms down to ~0.22 ms for 16384 blocks across 32 OMP threads.
+
+The same issue will affect the `voxelOffset` range-fill and `leafIndex` range-fill in the
+optimized path (§6 and §7): if the output arrays are caller-allocated (stack or TLS), `std::fill`
+and `std::copy` should be replaced with explicit AVX2 stores where performance matters. A sketch
+of this initialization follows.
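+
+The sketch assumes `BlockWidth == 128` and 32-byte-aligned output arrays (the
+function name is ours):
+
+```cpp
+#include <immintrin.h>
+#include <cstdint>
+
+// Hedged sketch: sentinel fill with explicit AVX2 stores, the pattern that
+// recovered vmovdqa on the test machine where std::fill stayed scalar.
+// Both sentinels are all-ones bit patterns, so set1(-1) produces them.
+static void initSentinels(uint32_t* leafIndex, uint16_t* voxelOffset)
+{
+    const __m256i idxFill = _mm256_set1_epi32(-1); // UnusedLeafIndex = 0xffffffff
+    const __m256i offFill = _mm256_set1_epi16(-1); // UnusedVoxelOffset = 0xffff
+    for (int i = 0; i < 128; i += 8)  // eight uint32_t per 256-bit store
+        _mm256_store_si256(reinterpret_cast<__m256i*>(leafIndex + i), idxFill);
+    for (int i = 0; i < 128; i += 16) // sixteen uint16_t per 256-bit store
+        _mm256_store_si256(reinterpret_cast<__m256i*>(voxelOffset + i), offFill);
+}
+```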
+
+---
+
+## 9. Function Signature (Proposed)
+
+The CPU version mirrors the GPU signature but with plain pointers (no `__device__`, no shared
+memory, no sync):
+
+```cpp
+template <...>              // class-level template parameters, elided here
+template <typename BuildT>
+void VoxelBlockManager::decodeInverseMaps(
+    const NanoGrid<BuildT>* grid,
+    uint32_t blockID,
+    const uint32_t* firstLeafID,
+    const uint64_t* jumpMap,
+    uint64_t blockFirstOffset,
+    uint32_t* leafIndex,    // output, length BlockWidth
+    uint16_t* voxelOffset)  // output, length BlockWidth
+```
+
+Or as a free function in a `cpu` sub-namespace alongside `buildVoxelBlockManager` in
+`VoxelBlockManager.h`.
+
+---
+
+## 10. Future Factoring
+
+Once `VoxelBlockManager` is annotated `__hostdev__` on all its members, the
+per-leaf logic shared between the CPU and GPU builds can be factored into a `__hostdev__ static`
+member (e.g., `accumulateLeafContribution(...)`) — see `project_vbm_factoring.md` in the memory
+directory. The `decodeInverseMaps` CPU/GPU split is a separate concern (SIMD vs warp cooperation)
+and will likely remain two implementations even after factoring.
+
+---
+
+## 11. SIMD Codegen Experiment: `shfl_down`
+
+The `simd_test/` directory (not checked into the repo) contained two source files and four
+assembly listings produced by GCC 13.3 (`-O3 -march=avx512f` / `-O3 -march=avx2`).
+
+### Source (both files identical except for the pragma)
+
+```cpp
+// shfl_down_test.cpp   — WITH #pragma omp simd
+// shfl_down_nosimd.cpp — WITHOUT #pragma omp simd (testing auto-vectorization alone)
+
+// Conditional blend: for j in [0, Width-Shift):
+//   out[j] = (shifts[j+Shift] & Shift) ? in[j+Shift] : in[j]
+// for j in [Width-Shift, Width):
+//   out[j] = in[j]
+template <typename T, int Width, int Shift>
+void shfl_down(const T* __restrict__ in,
+               const int* __restrict__ shifts,
+               T* __restrict__ out)
+{
+#pragma omp simd // omitted in shfl_down_nosimd.cpp
+    for (int j = 0; j < Width - Shift; j++)
+        out[j] = (shifts[j + Shift] & Shift) ? in[j + Shift] : in[j];
+
+    for (int j = Width - Shift; j < Width; j++)
+        out[j] = in[j];
+}
+
+// Instantiated for Shift = 1, 2, 4, 8, 16, 32, 64 with T=uint32_t, Width=128
+```
+
+### Assembly patterns observed
+
+**AVX-512** (both files produced identical output — auto-vectorization sufficed):
+```asm
+; Per 16-element chunk, for Shift=S:
+vpbroadcastd S, %zmm0               ; broadcast shift constant
+vpandd S*4(%rsi), %zmm0, %zmm2      ; mask = shifts[j+S] & S
+vpcmpd $4, %zmm1, %zmm2, %k1        ; k1 = mask != 0 (take in[j+S])
+vpcmpd $0, %zmm1, %zmm2, %k2        ; k2 = mask == 0 (take in[j])
+vmovdqu32 S*4(%rdi), %zmm3{%k1}{z}  ; load in[j+S] where mask != 0
+vmovdqu32 (%rdi), %zmm2{%k2}{z}     ; load in[j] where mask == 0
+vmovdqa32 %zmm3, %zmm2{%k1}         ; merge
+vmovdqu32 %zmm2, (%rdx)             ; store
+```
+Each pass: 2 compares + 2 masked zero-loads + 1 masked merge + 1 store per 16 elements.
+
+**AVX2** (both files produced identical output):
+```asm
+; Per 8-element chunk:
+vpand S*4(%rsi), %ymm1, %ymm3       ; mask = shifts[j+S] & S
+vpcmpeqd %ymm0, %ymm3, %ymm3        ; ymm3 = (mask == 0) — "take in[j]" predicate
+vpmaskmovd (%rdi), %ymm3, %ymm4     ; load in[j] where mask == 0
+vpcmpeqd %ymm0, %ymm3, %ymm2        ; ymm2 = (mask != 0) — "take in[j+S]" predicate
+vpmaskmovd S*4(%rdi), %ymm2, %ymm2  ; load in[j+S] where mask != 0
+vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; blend: ymm3 selects in[j], ymm2 selects in[j+S]
+vmovdqu %ymm2, (%rdx)               ; store
+```
+Each pass: `vpand` + 2×`vpcmpeqd` + 2×`vpmaskmovd` + `vpblendvb` + store per 8 elements.
+
+### Key findings
+
+1. 
**`#pragma omp simd` was not needed on GCC 13.3** — the `nosimd` version auto-vectorized + to identical output on both AVX-512 and AVX2. The pragma is still recommended for portability + across compilers with weaker auto-vectorization. It is safe to use without guards: unknown + pragmas are silently ignored by standard-conforming C++ compilers (C++17 §16.6), and all + major compilers recognize the `omp` namespace even without OpenMP enabled. + +2. **No architecture-specific intrinsics needed.** A single portable source compiles to optimal + SIMD on both targets. + +3. **No register-level shuffle instructions** (`vpermps`, `vpshufb`, etc.) appear anywhere. The + fixed compile-time offset is treated as a constant address displacement — the "shuffle" is + simply a load from `in + Shift`, which is a free addressing mode. + +4. **AVX-512 is cleaner**: 5 instructions vs AVX2's 7 per chunk, and uses mask registers + instead of `vpblendvb`. + +5. **Software `popcount32`** (Hamming weight via AND/shift/add/multiply) auto-vectorizes to + `VPMULLD` on both AVX2 and AVX-512. `VPOPCNTQ` (AVX-512VPOPCNTDQ) is **not** required. + +6. **`__restrict__` is load-bearing, not just a hint.** Without it the compiler must assume + `in` and `out` may alias, making vectorization of the loop illegal (writes to `out[j]` could + affect subsequent reads of `in[j+Shift]`). The experiment results are only valid because + `__restrict__` was present. + + `__restrict__` is a compiler extension, not standard C++ (`restrict` is C99 only). For + portability a macro is needed. NanoVDB has no existing C++ macro for this — `CNanoVDB.h` + defines `RESTRICT __restrict` but that is for the C API only. A new macro should be added: + + ```cpp + #if defined(_MSC_VER) + # define NANOVDB_RESTRICT __restrict + #else + # define NANOVDB_RESTRICT __restrict__ + #endif + ``` + + This matches the pattern used by `_CCCL_RESTRICT` in the bundled CCCL dependency. + +--- + +## 12. Benchmarking Findings (ex_voxelBlockManager_host_cuda) + +Measurements on the test machine (32 OMP threads, BlockWidth=128, 16384 blocks / 2M active +voxels, AVX2 but no AVX-512). + +### Baseline numbers + +| Path | Time per full pass | +|------|--------------------| +| GPU `decodeInverseMaps` (all blocks, `benchDecodeKernel`) | ~0.039 ms | +| CPU `decodeInverseMaps`, 32 OMP threads, unoptimized (`getValue()` loop) | ~77 ms | +| CPU initialization only (AVX2 stores, 32 threads) | ~0.22 ms | +| CPU OMP scheduling overhead (empty loop body, 16384 iterations) | ~0.002 ms | + +The GPU/CPU gap is ~2000×. The `getValue()` loop accounts for essentially all of the CPU cost. + +### OMP parallelism + +The outer loop over blocks (`#pragma omp for schedule(static)`) parallelizes correctly — a +fill-only sanity check scaled from ~77ms (single-thread equivalent) to ~1.5ms with 32 threads +(~40×). However the full `decodeInverseMaps` showed **zero scaling** with OMP threads. This +confirms the bottleneck is memory-bandwidth or cache-thrashing in the `getValue()` traversal, +not compute: all 32 threads together saturate available bandwidth accessing leaf data, giving no +wall-time improvement over serial. + +### `getValue()` is the bottleneck + +`getValue(localOffset)` on a `ValueOnIndex` leaf accesses `mValueMask` and the packed +`mPrefixSum` field to compute the sequential index. It is read-only but touches leaf node data +for every one of 512 slots per leaf, for every leaf overlapping the block. 
The unoptimized path +is O(512 × nLeaves) memory accesses per block rather than O(64 bytes of valueMask) per leaf. +Replacing this with the prefix-array approach (§4) is the primary optimization target. + +### Build flags + +`-mavx2` must be passed explicitly to both the host compiler and nvcc (`-Xcompiler -mavx2`). +Without it, `std::fill` on TLS pointers generates scalar stores. The flag is set in +`examples/CMakeLists.txt` via `target_compile_options` for `ex_voxelBlockManager_host_cuda`. + +### `prefix_popcnt_bench` standalone micro-benchmark + +`prefix_popcnt_bench.cpp` (in the same directory) isolates the Phase 1 + Phase 2 computation — +1M blocks, each with a runtime-unknown 16-word mask generated by an LCG, single-threaded, +`prefixCountRealigned[32][16]` allocated outside the loop. Results on the test machine (AVX2, +no AVX-512, GCC 13.3, `-O3 -mavx2`): + +| Implementation | Min time (1M blocks) | ns/block | +|----------------|---------------------|----------| +| Auto-vectorized (`popcount32` + `#pragma omp simd`) | ~130 ms | ~124 ns | +| Auto-vectorized with `-mno-popcnt` | ~101 ms | ~96 ns | +| Explicit AVX2 intrinsics (`vpshufb` nibble-table) | ~70 ms | ~66.5 ns | + +**Key finding — `#pragma omp simd` is silently defeated by `-mavx2`.** +When hardware POPCNT is available (implied by `-mavx2` on x86), GCC replaces the `popcount32` +Hamming-weight expression with the scalar `popcntl` instruction and then runs the lane loop +scalar. The `#pragma omp simd` hint is ignored because the compiler considers scalar `popcntl` +cheaper than the vectorized software path. The result is 16 sequential `popcntl` calls per step, +not a SIMD operation across all 16 lanes. + +With `-mno-popcnt`, GCC falls back to the software Hamming weight and auto-vectorizes correctly +to the 2×`__m256i` path (~96 ns). However `-mno-popcnt` is not suitable for production (it +disables hardware POPCNT throughout the TU, including places like `countOn` where it is wanted). + +**Explicit `vpshufb` intrinsics** (`computePrefixPopcntAVX2` in `prefix_popcnt_bench.cpp`) +bypass this issue entirely: the nibble-table lookup uses ~10 SIMD instructions per step across +all 16 lanes, without any `popcntl` in sight. At ~66.5 ns/block this is **1.87× faster** than +the auto-vectorized baseline and is the approach to use in the optimized CPU `decodeInverseMaps`. + +`vpshufb` popcount recipe (8 uint32 lanes per `__m256i`, applied twice for 16 lanes): +```cpp +// lut[i] = popcount(i), for i in 0..15, replicated in both 128-bit lanes +const __m256i lut = _mm256_set_epi8(4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0, + 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0); +const __m256i low4 = _mm256_set1_epi8(0x0f); +const __m256i ones8 = _mm256_set1_epi8(1); +const __m256i ones16= _mm256_set1_epi16(1); + +__m256i lo = _mm256_and_si256(v, low4); // low nibbles +__m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low4); // high nibbles +__m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo), + _mm256_shuffle_epi8(lut, hi)); // byte popcounts +__m256i s = _mm256_madd_epi16(_mm256_maddubs_epi16(cnt, ones8), ones16); // sum → 32-bit +``` +`maskWords` (constant across all 32 steps) should be loaded into two `__m256i` registers once +before the step loop — the compiler will hoist the broadcasts of `lut`, `low4`, `ones8`, `ones16` +automatically. + +--- + +## 13. 
Alternative Algorithm: Bit-Parallel Z+Y Prefix Sum + +This section records an alternative algorithm under investigation for computing +`uint16_t prefixSums[512]` (exclusive linear prefix popcount per voxel) from a `Mask<3>` +(`valueMask` of a leaf node), using bit-parallel operations on the 8 × 64-bit mask words. +The algorithm is implemented and tested in `simd_test/within_word_prefix_test.cpp`. + +### 13a. Data Layout + +```cpp +union qword { uint64_t ui64; uint8_t ui8[8]; }; +static constexpr uint64_t kSpread = 0x0101010101010101ULL; + +qword data[8][8]; // indexed [z][x] +// data[z][x].ui8[y] ↔ voxel (x, y, z), x = word index, y*8+z = bit within word +``` + +NanoVDB leaf linear index: `i = x*64 + y*8 + z`. Word index = x (0..7), within-word bit +position = y\*8+z, with z as fast index (bits 0..2 of each 8-bit group) and y as slow (byte +index 0..7 within the 64-bit word). + +`data[z][:]` is contiguous — 64 bytes = one cache line = two YMM registers. This enables +`#pragma omp simd` over x in both passes below. + +### 13b. Z-Pass: Indicator Fill + Running Sum + +```cpp +// z=0: extract bit 0 from each byte of each word +#pragma omp simd +for (int x = 0; x < 8; x++) + data[0][x].ui64 = maskWords[x] & kSpread; + +// z=1..7: accumulate bit z from each byte, running sum over z +for (int z = 1; z < 8; z++) { + #pragma omp simd + for (int x = 0; x < 8; x++) + data[z][x].ui64 = data[z-1][x].ui64 + ((maskWords[x] >> z) & kSpread); +} +``` + +After this pass: `data[z][x].ui8[y]` = Σ_{z'≤z} bit(x, y, z') — per-column z-prefix for +each (x, y). Per-byte maximum = 8; fits in `uint8_t` with no inter-byte carry. `vpaddq` +and `vpaddb` are equivalent here. + +**Latency hiding**: the indicator fill `(maskWords[x] >> z) & kSpread` is independent of +`data[z-1][x]`, so the OOO engine can issue it during the 1-cycle `vpaddq` latency. The +7-step dependency chain runs at ~1 cycle/step (throughput-bound, not latency-bound). + +### 13c. Y-Pass: Hillis-Steele Prefix Scan Within uint64 + +```cpp +for (int z = 0; z < 8; z++) { + #pragma omp simd + for (int x = 0; x < 8; x++) { + data[z][x].ui64 += data[z][x].ui64 << 8; + data[z][x].ui64 += data[z][x].ui64 << 16; + data[z][x].ui64 += data[z][x].ui64 << 32; + } +} +``` + +`vpsllq imm8` is fully supported in AVX2 (1-cycle throughput, 1-cycle latency). Per-byte +maximum after this pass: 64 (8 z-values × 8 y-values); still fits in `uint8_t`. No +inter-byte carry corruption since bytes evolve independently under byte-parallel arithmetic. + +After this pass: `data[z][x].ui8[y]` = **2D rectangle inclusive sum** = +Σ_{y'≤y, z'≤z} bit(x, y', z'). + +### 13d. Assembly Quality (GCC 13.3, -O3 -march=core-avx2) + +The compiler fully unrolls both passes and keeps all intermediate values register-resident. +The z-pass processes `data[z][:]` two YMM registers at a time (x=0..3 and x=4..7), with +one spill (z=7, x=0..3 half) due to requiring all 16 YMM registers simultaneously. The +y-pass operates directly on the register-resident z-pass results without reloading from +memory. The only missed optimization is 16 dead stores from the z-pass that are immediately +overwritten by the y-pass. Overall this is essentially what hand-written intrinsics would +produce. + +### 13e. 2D Rectangle vs Linear Prefix (Correctness Finding) + +**Key finding from `simd_test/within_word_prefix_test.cpp`**: the z+y algorithm computes a +**2D rectangle sum**, not the linear prefix sum that `getValue()` uses. 
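+
+A scalar contrast of the two quantities may help (notation from §13a;
+`countOn64` is the 64-bit popcount stand-in also used in §16e):
+
+```cpp
+// Hedged sketch: linear vs 2D-rectangle prefix for word w at position (y, z).
+// Linear exclusive prefix: every set bit strictly before position y*8 + z.
+uint32_t linearExcl = countOn64(w & ((1ULL << (y*8 + z)) - 1));
+
+// 2D rectangle (inclusive): only bits with y' <= y AND z' <= z; the row
+// tails (z' > z) of rows y' < y are not counted, hence the mismatch.
+uint32_t rectIncl = 0;
+for (int yy = 0; yy <= y; yy++)
+    for (int zz = 0; zz <= z; zz++)
+        rectIncl += uint32_t((w >> (yy*8 + zz)) & 1u);
+```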
+ +`getValue()` for `ValueOnIndex` computes: `countOn(w & ((1ULL << (y*8+z)) - 1))` = exclusive +count of set bits at positions 0..y\*8+z−1 within word x. This is a **linear** prefix (a +staircase: all bits in rows 0..y−1 plus bits in row y up to column z). + +The 2D rectangle sum Σ_{y'≤y, z'≤z} bit(x,y',z') counts only up to column z in every +preceding row, missing the "row tails" for y' < y. Test result on 1000 random masks: +2D rectangle matches its own reference at 100% (512000/512000); linear inclusive match is +only ~26% (132806/512000), confirming the discrepancy. + +First mismatch example: at (x=0, y=1, z=0), 2D rect = 2 (bits at y=0,z=0 and y=1,z=0), +linear inclusive = 7 (all 7 bits at positions 0..8 in the word). + +### 13f. Rectangle→Linear Fixup + +The linear inclusive prefix at (x, y, z) can be recovered from the 2D rectangle data as: + +``` +linear_incl(x, y, z) = data[7][x].ui8[y-1] // all complete rows 0..y-1 (z'=0..7) + + data[z][x].ui8[y] // current row y, columns 0..z + - data[z][x].ui8[y-1] // subtract over-counted rectangle below +``` + +This simplifies to adding a y-dependent correction, expressible as a byte-parallel operation: + +```cpp +for (int z = 0; z < 8; z++) { + #pragma omp simd + for (int x = 0; x < 8; x++) + data[z][x].ui64 += (data[7][x].ui64 - data[z][x].ui64) >> 8; +} +``` + +`data[7][x].ui64` (available in registers after the y-pass) gives the full per-row popcounts +packed in bytes; the byte-shift-right-by-8 shifts row y−1's value into row y's byte lane. +This fixup is cheap — one subtract and one shift per (z, x) pair, all in-place in the +byte-packed representation. + +### 13g. Cross-Word Offsets (mPrefixSum) + +`LeafIndexBase::mPrefixSum` stores 7 nine-bit cumulative popcounts (the exclusive prefix +scan at word boundaries): + +- bits 0–8: Σ_{j=0}^{0} countOn(words[j]) = exclusive prefix at x=1 +- bits 9–17: Σ_{j=0}^{1} countOn(words[j]) = exclusive prefix at x=2 +- ... +- bits 54–62: Σ_{j=0}^{6} countOn(words[j]) = exclusive prefix at x=7 + +These are available for free and must be added to `data[z][x].ui8[y]` to obtain the full +global sequential index. However, these offsets require up to 9 bits (max value = 512), +which exceeds `uint8_t`. Two approaches for incorporating them: + +**Approach #1 — Pack offsets into a uint64 byte lane and vpaddq directly.** This fails for +any leaf where the cross-word cumulative count exceeds 255 (i.e., more than ~255 active +voxels in the preceding words — reachable for moderately dense leaves by the 4th word). +Only viable for very sparse leaves. + +**Approach #2 — Transpose to uint16_t prefixSums[8][8][8].** Unpack the byte-packed +`data[z][x].ui8[y]` into `uint16_t prefixSums[x][y][z]` (indexed [x][y][z] = linear order), +then add the 9-bit cross-word offsets in the wider format. Widening is safe; all values fit +in uint16_t (max = 512). The cost is a 3D index-permutation transpose +`(z,x,y) → (x,y,z)` on 64 bytes → 128 bytes. + +### 13h. Transposition Cost and Alternatives + +The output transpose (approach #2) is expensive in isolation: no loop ordering gives a +unit-stride inner loop for both source and destination simultaneously, so GCC cannot +auto-vectorize it. With explicit AVX2 intrinsics (8×8 byte matrix transpose per x-slice, +8 slices) the cost is ~200 instructions; even scalar it is ~512 operations on L1-resident +data (~400–800 cycles), dominating the ~14-cycle z+y passes. 
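+
+As an aside to §13g above: decoding the packed 9-bit cumulative popcounts is a
+handful of shifts. A sketch (`mPrefixSum` is the leaf's packed member;
+`xOffset` is our name for the decoded table):
+
+```cpp
+// Hedged sketch: unpack the 7 nine-bit exclusive cross-word counts.
+// xOffset[0] = 0 by definition; xOffset[x] = active count of words 0..x-1.
+uint32_t xOffset[8];
+xOffset[0] = 0;
+for (int x = 1; x < 8; x++)
+    xOffset[x] = uint32_t(mPrefixSum >> (9 * (x - 1))) & 0x1FFu;
+```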
+ +**Bit-transpose alternative**: pre-transpose the 8 input uint64_t words (64 bytes) instead +of post-transposing 512 uint16_t values (1024 bytes). The specific transposition that makes +the algorithm output naturally land in `[x][y][z]` memory order is: organize input as +`inputWords[y]` with bit `z*8+x` = B[x][y][z] (making y the word index, z the byte index, +and x the step variable). Transposing 64 bytes is intrinsically cheaper than transposing +1024 bytes, and the 8×8 bit-matrix transpose per y-slice is a well-studied ~10–15 instruction +operation. + +**Key tradeoff — good output order ↔ simple rectangle→linear fixup:** + +With the original layout (word=x, byte=y, step=z), the 2D rectangle is over (y, z) for fixed +x, and the rectangle→linear fixup collapses to the single byte-shift expression in §13f. + +With the bit-transposed layout (word=y, byte=z, step=x), the 2D rectangle is over (x, z) for +fixed y, and the "missing" terms for the linear prefix involve cross-word contributions from +all y-slices of preceding words — a significantly more complex expression that does not reduce +to a simple in-register byte operation. + +No 3D transposition of the input eliminates both costs simultaneously. The original layout +remains preferred for the simplicity of the fixup; the output transpose cost must be addressed +separately (either by tolerating it, using explicit intrinsics, or changing the consumer's +expected layout). + +--- + +## 14. Input Bit-Transpose: Decomposition and Implementation + +Although §13h concluded that the original layout (word=x) is preferred for fixup simplicity, +the input bit-transpose approach was further analyzed for completeness and because the output +transpose cost remains a concern. This section records the decomposition and implementation +decisions made during that analysis. + +### 14a. Target: y→z→x Layout + +To make the algorithm's output land naturally in `[x][y][z]` memory order (= NanoVDB linear +index order), the input words must be pre-arranged so that: + +``` +inputWords[y] bit (z*8 + x) = B[x][y][z] +``` + +i.e. word index = y (0..7), within-word byte index = z (0..7), within-byte bit index = x +(0..7). With this layout the algorithm's step variable becomes x, and +`data[x][y].ui8[z]` maps to voxel `(x, y, z)` — the standard linear-index order. + +This input transposition from the original NanoVDB layout (word=x, byte=y, bit=z) to the +target (word=y, byte=z, bit=x) is a 3-axis permutation of an 8×8×8 bit cube. It +decomposes into two independent transformations: + +1. **Step 1 — 8×8 byte-matrix transpose**: given `maskWords[x]` where byte y = B[x][y][:], + produce `tempWords[y]` where byte x = B[x][y][:]. (Only the byte-level arrangement + changes; bit ordering within each byte is unchanged.) + +2. **Step 2 — 8×8 bit-matrix transpose within each uint64**: given `tempWords[y]` where + byte x bit z = B[x][y][z], produce `inputWords[y]` where byte z bit x = B[x][y][z]. + (The byte→bit and bit→byte roles are swapped within each word.) + +### 14b. Step 2 — Bit-Matrix Transpose (Knuth 3-Round) + +This step is a standard 8×8 bit-matrix transpose applied independently to each of the 8 +uint64 words. 
The Knuth bit-interleaving algorithm uses three rounds of XOR/shift/AND: + +```cpp +static inline uint64_t transpose8x8bits(uint64_t x) +{ + // Round 1: swap 1×1 blocks at stride 1 within 2×2 tiles + uint64_t t = (x ^ (x >> 7)) & 0x00aa00aa00aa00aaULL; + x ^= t ^ (t << 7); + // Round 2: swap 2×2 blocks at stride 2 within 4×4 tiles + t = (x ^ (x >> 14)) & 0x0000cccc0000ccccULL; + x ^= t ^ (t << 14); + // Round 3: swap 4×4 blocks at stride 4 within 8×8 tiles + t = (x ^ (x >> 28)) & 0x00000000f0f0f0f0ULL; + x ^= t ^ (t << 28); + return x; +} +``` + +**Portability**: pure C++17, no intrinsics, no builtins. Under `#pragma omp simd` on 8 words +GCC emits ~36 scalar-width SIMD instructions (`vpsrlq`, `vpxor`, `vpand`, `vpsllq`) — fully +auto-vectorized. The 8 words are independent so there is no cross-element dependency. + +### 14c. Step 1 — Byte-Matrix Transpose (`__builtin_shufflevector`) + +The 8×8 byte-matrix transpose is a gather pattern: `tempWords[y]` byte x = byte y of +`maskWords[x]`. Compilers cannot auto-vectorize arbitrary gather patterns, so explicit +shuffle operations are required for SIMD throughput. + +On Clang, `__builtin_shufflevector` on `uint8_t __attribute__((vector_size(16)))` vectors +maps directly to architecture-appropriate byte-shuffle instructions (`vpunpcklbw`/`vpunpckhbw` +on x86, `vzip`/`vuzp` on ARM). On GCC, the equivalent is `__builtin_shuffle` with an integer +mask vector. Both builtins are already in the spirit of NanoVDB's existing use of +`__builtin_popcountll`, `__builtin_ctzl`, etc. in `nanovdb/util/Util.h`. + +The scalar fallback (64 independent byte moves) is branch-free and operates entirely on +L1-resident data — fast even without SIMD. + +Implementation with a scalar fallback and `NANOVDB_USE_INTRINSICS` guard (Clang path shown): + +```cpp +using u8x16 = uint8_t __attribute__((vector_size(16))); + +static void byteTranspose8x8(const uint64_t src[8], uint64_t dst[8]) +{ +#if defined(__clang__) && defined(NANOVDB_USE_INTRINSICS) + // Load 8 words as four 16-byte vectors (two words each) + u8x16 v01, v23, v45, v67; + __builtin_memcpy(&v01, src+0, 16); __builtin_memcpy(&v23, src+2, 16); + __builtin_memcpy(&v45, src+4, 16); __builtin_memcpy(&v67, src+6, 16); + + // Round 1: interleave bytes within each pair (vpunpcklbw / vpunpckhbw) + u8x16 t01 = __builtin_shufflevector(v01,v01, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15); + u8x16 t23 = __builtin_shufflevector(v23,v23, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15); + u8x16 t45 = __builtin_shufflevector(v45,v45, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15); + u8x16 t67 = __builtin_shufflevector(v67,v67, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15); + + // Round 2: gather 2-byte groups across pairs (vpunpcklwd / vpunpckhwd) + u8x16 q02lo = __builtin_shufflevector(t01,t23, 0,1,16,17, 2,3,18,19, 4,5,20,21, 6,7,22,23); + u8x16 q02hi = __builtin_shufflevector(t01,t23, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31); + u8x16 q46lo = __builtin_shufflevector(t45,t67, 0,1,16,17, 2,3,18,19, 4,5,20,21, 6,7,22,23); + u8x16 q46hi = __builtin_shufflevector(t45,t67, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31); + + // Round 3: gather 4-byte groups across quad-pairs (vpunpckldq / vpunpckhdq) + u8x16 r01 = __builtin_shufflevector(q02lo,q46lo, 0,1,2,3,16,17,18,19, 4,5,6,7,20,21,22,23); + u8x16 r23 = __builtin_shufflevector(q02lo,q46lo, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31); + u8x16 r45 = __builtin_shufflevector(q02hi,q46hi, 0,1,2,3,16,17,18,19, 4,5,6,7,20,21,22,23); + u8x16 r67 = __builtin_shufflevector(q02hi,q46hi, 
8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31); + + __builtin_memcpy(dst+0,&r01,16); __builtin_memcpy(dst+2,&r23,16); + __builtin_memcpy(dst+4,&r45,16); __builtin_memcpy(dst+6,&r67,16); +#else + // Scalar fallback: 64 independent byte moves, L1-resident + const uint8_t* s = reinterpret_cast(src); + uint8_t* d = reinterpret_cast(dst); + for (int i = 0; i < 8; i++) + for (int j = 0; j < 8; j++) + d[i*8+j] = s[j*8+i]; +#endif +} +``` + +**GCC path**: replace `__builtin_shufflevector(a, b, ...)` with +`__builtin_shuffle((u8x16)(a), (u8x16)(b), (u8x16){...})` using the same index patterns. + +**AVX-512 note**: `__builtin_shufflevector` on 16-byte vectors emits fixed-width 128-bit +instructions. Unlike `#pragma omp simd` loops (which the compiler may promote to 256- or +512-bit), explicit `__builtin_shufflevector` calls on `vector_size(16)` remain 128-bit even +when targeting AVX-512. For AVX-512 width, 32-byte (`vector_size(32)`) vectors would be +needed, processing two 8-word groups per instruction. + +### 14d. Full Pipeline + +With the input bit-transpose in place, the complete algorithm for a single leaf becomes: + +``` +1. byteTranspose8x8(maskWords, tempWords) // Step 1: byte-matrix transpose +2. for y in 0..7: inputWords[y] = transpose8x8bits(tempWords[y]) // Step 2: bit-matrix transpose +3. computeZYPrefix(inputWords, data) // Z-pass + Y-pass (§13b/13c, step variable = x) +4. Rectangle→linear fixup (§13f formula, over (x,z) plane for fixed y — complexity TBD) +5. Zero-extend data[x][y].ui8[z] → uint16_t prefixSums[x*64 + y*8 + z] (vpmovzxbw, unit-stride) +6. Add mPrefixSum cross-word offsets // 8 groups × 64 uint16_t additions, auto-vectorizable +``` + +**Step 5 (zero-extension)**: with output naturally in `[x][y][z]` order, the zero-extension +from packed uint8_t to uint16_t is a unit-stride `vpmovzxbw` over 64 contiguous bytes — no +reordering, trivially auto-vectorizable. + +**Step 6 (cross-word offsets)**: add a constant (from `mPrefixSum`, up to 9 bits) to each of +the 8 groups of 64 uint16_t values — 8 broadcast-and-add SIMD operations, trivially +auto-vectorizable. + +### 14e. Open Question: Fixup Formula with y→z→x Layout + +The rectangle→linear fixup (step 4) is well-understood for the original layout (§13f): +with step variable = z, the 2D rectangle is over (y, z), and the correction is a simple +byte-shift within the same word. + +With the y→z→x layout and step variable = x, the 2D rectangle is over (x, z) within each +y-row. The "missing" contribution for the linear prefix consists of the complete earlier +y-rows (y' < y) at all z values — contributions that live in *different words* of `data`. +Whether this can be expressed as a comparably cheap byte-parallel operation on the register- +resident `data` array is pending analysis. + +### 14f. Cost Summary + +| Step | Cost (approx) | Portability | +|------|--------------|-------------| +| Step 1 (byte transpose) | ~12 shuffles (SIMD) / 64 moves (scalar) | `__builtin_shufflevector` (Clang) or `__builtin_shuffle` (GCC); scalar fallback | +| Step 2 (bit transpose) | ~36 SIMD ops (`#pragma omp simd`) | Pure C++17; auto-vectorized | +| Z-pass + Y-pass | ~14 cycles / ~56 SIMD ops | Pure C++17; auto-vectorized | +| Fixup | TBD (§14e) | TBD | +| Zero-extension | ~4 `vpmovzxbw` | Auto-vectorized | +| Cross-word offsets | ~8 broadcast+add | Auto-vectorized | + +--- + +## 15. Implementation Status and Next Steps (as of 2026-03-23) + +### 15a. 
Completed
+
+- **`nanovdb/util/Transpose.h`** — `transposeBits8x8` (Knuth 3-round, pure C++17,
+  `__hostdev__`) and `transposeBytes8x8` (3-round butterfly via `__builtin_shuffle` /
+  `__builtin_shufflevector` with scalar fallback). Both functions follow the
+  `nanovdb::util` free-function style of `Util.h`. Correctness verified by
+  `simd_test/transpose_test.cpp` on SIMD and scalar paths. Assembly inspected for
+  AVX2 (21 instructions) and AVX-512 (27 instructions, fixed 128-bit width).
+
+- **`simd_test/within_word_prefix_test.cpp`** — correctness test for `computeZYPrefix`
+  (z-pass + y-pass). Confirms the algorithm produces the 2D rectangle inclusive sum
+  `data[z][x].ui8[y]` at 100% accuracy; confirms ~74% discrepancy vs linear inclusive
+  prefix (as expected).
+
+- Input bit-transpose (`transposeBytes8x8` + `transposeBits8x8`) maps `maskWords[x]`
+  (word=x, byte=y, bit=z) → `inputWords[y]` (word=y, byte=z, bit=x), so that the
+  z-pass + y-pass output lands naturally in `data[x][y].ui8[z]` = linear `x*64+y*8+z`
+  order.
+
+### 15b. Next Steps
+
+1. **Rectangle→linear fixup (§14e)** — work out the fixup formula for the y→z→x layout.
+   With step variable = x, the 2D rectangle is over (z, x) within each y-row; the
+   "missing" contribution (complete earlier y-rows at all z) comes from different words
+   of `data` and does not reduce to the simple byte-shift of §13f. This is the key
+   open design question before the pipeline is complete.
+
+2. **Zero-extension** — expand `data[x][y].ui8[z]` (byte-packed, 64 bytes) to
+   `uint16_t prefixSums[512]` in `x*64+y*8+z` linear order. Already in the correct
+   memory order after the input bit-transpose; trivially auto-vectorizable via
+   `vpmovzxbw`.
+
+3. **Cross-word offset addition** — add the 9-bit cumulative `mPrefixSum` offsets
+   (one per x-group of 64 voxels) to the uint16_t array. 8 broadcast-and-add
+   operations; trivially auto-vectorizable.
+
+4. **Inclusive→exclusive conversion** — subtract the active bit at each position
+   (`(maskWords[x] >> (y*8+z)) & 1`) to convert from inclusive to exclusive prefix,
+   matching `LeafData::getValue()` semantics.
+
+5. **End-to-end correctness test** — integrate all steps and verify against the
+   reference `getValue()` loop for random `Mask<3>` inputs.
+
+---
+
+## 16. Plan #1 — x-major Layout (`data[x][y].ui8[z]`)
+
+An alternative to the original `data[z][x]` algorithm (§13) that keeps the
+native x-y-z mask-word ordering, requires no input bit-transpose, and produces
+the linear inclusive prefix sum **directly** (no rectangle→linear fixup).
+
+### 16a. Layout
+
+```cpp
+union qword { uint64_t ui64; uint8_t ui8[8]; };
+qword data[8][8]; // data[x][y].ui8[z] ↔ voxel (x, y, z)
+```
+
+- `x` (0..7): word index — outer array dimension, slow index
+- `y` (0..7): byte-within-word — inner array dimension
+- `z` (0..7): bit-within-byte — **byte index** within the uint64
+
+For fixed `x`: `data[x][0..7]` is 64 contiguous bytes (one cache line), enabling
+`#pragma omp simd` over `y`. The byte index `z` lives *inside* each uint64, so
+the Hillis-Steele within-uint64 scan naturally operates along `z`.
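+
+A quick addressing check for this layout (illustrative only; little-endian byte
+order assumed, and the same union type-punning the plan's `qword` already relies on):
+
+```cpp
+#include <cassert>
+#include <cstdint>
+
+union qword { uint64_t ui64; uint8_t ui8[8]; }; // mirrors the definition above
+
+int main()
+{
+    qword data[8][8] = {};
+    const int x = 3, y = 5, z = 2;  // voxel (x, y, z), linear i = x*64 + y*8 + z
+    data[x][y].ui8[z] = 42;         // byte z of the word at (x, y)
+    // On a little-endian target, byte z occupies bits 8z..8z+7 of the uint64.
+    assert(((data[x][y].ui64 >> (8 * z)) & 0xFF) == 42);
+    return 0;
+}
+```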
+
+### 16b. Algorithm
+
+```
+Step 1 — Indicator fill (scalar triple loop; optimize later):
+   data[x][y].ui8[z] = (maskWords[x] >> (y*8 + z)) & 1 = I[x][y][z]
+
+Step 2 — Z-pass: Hillis-Steele inclusive prefix sum over z within each uint64.
+   for x in 0..7:
+     for y in 0..7:              ← simd-vectorizable (contiguous, no dep between y)
+       data[x][y].ui64 += data[x][y].ui64 << 8
+       data[x][y].ui64 += data[x][y].ui64 << 16
+       data[x][y].ui64 += data[x][y].ui64 << 32
+   After: data[x][y].ui8[z] = Σ_{z'=0..z} I[x][y][z']
+   Bonus: data[x][y].ui8[7] = full row y popcount (free).
+
+Step 3 — Y-pass: exclusive row-prefix scan + broadcast.
+   3a. Extract row popcounts:
+       shifts[x][y].ui64 = data[x][y].ui64 >> 56   (byte 0 = row popcount, rest = 0)
+
+   3b. Exclusive y-prefix scan of shifts:
+       rowOffset[x][0] = 0
+       rowOffset[x][y] = rowOffset[x][y-1] + shifts[x][y-1]   for y = 1..7
+       Sequential over y (loop-carried); independent over x — with a transposed
+       [y][x] layout the inner x-loop is unit-stride and AVX2/AVX-512-vectorizable.
+
+   3c+3d. Broadcast byte 0 to all 8 bytes and add:
+       data[x][y].ui64 += rowOffset[x][y].ui64 * kSpread
+       After: data[x][y].ui8[z] = Σ_{y'<y} rowPopcount(x, y') + Σ_{z'≤z} I[x][y][z']
+              = linear inclusive prefix within word x.
+
+Step 4 — Zero-extension: widen data[x][y].ui8[z] to uint16_t prefixSum[512]
+   in x*64 + y*8 + z linear order.
+
+Step 5 — Cross-word offsets from mPrefixSum:
+   xOffset[x] = (mPrefixSum >> 9*(x-1)) & 0x1FF   for x = 1..7
+   prefixSum[x*64 .. x*64+63] += xOffset[x]   (broadcast + vpaddw, 4 AVX2 ops/slice)
+   After: prefixSum[i] = full linear inclusive prefix count within the leaf at voxel i.
+```
+
+### 16c. Why No Rectangle→Linear Fixup
+
+In the original `data[z][x]` algorithm (§13), the Y-pass accumulates a 2D
+rectangle sum and then a separate fixup step (§13f) corrects it to a linear sum.
+In Plan #1, the Y-pass adds **complete row popcounts** (`data[x][y].ui8[7]` from
+the Z-pass) as a scalar broadcast. The scalar added to row `y` is exactly
+`Σ_{y'<y} rowPopcount(x, y')` — the complete-rows term of the linear prefix — so
+the Y-pass output is already the linear inclusive prefix within each word and no
+fixup pass is needed.
+
+### 16d. Inclusive→Exclusive Conversion
+
+To match `LeafData<ValueOnIndex>::getValue() - mOffset`
+(exclusive), subtract the active bit:
+
+```cpp
+prefixSum[x*64 + y*8 + z] -= (maskWords[x] >> (y*8+z)) & 1u;
+```
+
+For the `decodeInverseMaps` use case (building `leafLocalOffsets[]`) the inclusive
+form is equally usable; the choice depends on the consumer's convention.
+
+### 16e. Reference and Correctness
+
+```cpp
+// Linear inclusive prefix at (x, y, z):
+// Safe mask form: (2ULL << bitPos) - 1u covers bits 0..bitPos.
+// At bitPos=63: unsigned wrap gives 0xFFFFFFFFFFFFFFFF. ✓
+uint16_t ref = xOffset[x] + countOn64(maskWords[x] & ((2ULL << (y*8+z)) - 1u));
+```
+
+Verified in `simd_test/plan1_prefix_test.cpp`: 512000/512000 positions correct
+across 1000 random `Mask<3>`-equivalent inputs.
+
+### 16f. Indicator Fill — `scatterLSB` Vectorization
+
+The original scalar triple loop (Step 1) is replaced by a multiply-free bit-scatter
+that eliminates the inner `z`-loop:
+
+```cpp
+static inline uint64_t scatterLSB(uint64_t src)
+{
+    uint64_t x = src & 0xFFu;
+    // Stage 1: replicate into 16-bit pairs.
+    // Multiplier 2^0+2^14+2^28+2^42 = (1+2^14)(1+2^28); x≤8 bits so OR≡ADD.
+    // Emits vpsllq+vpaddq pairs under AVX2/AVX-512 (no vpmuludq needed).
+    x = (x | (x << 14) | (x << 28) | (x << 42)) & UINT64_C(0x0003000300030003);
+    // Stage 2: separate each pair into individual byte lanes (1+2^7).
+    x = (x | (x << 7)) & UINT64_C(0x0101010101010101);
+    return x;
+}
+
+// Indicator fill (Step 1) — replaces triple loop:
+for (int x = 0; x < 8; x++) {
+    #pragma omp simd
+    for (int y = 0; y < 8; y++)
+        data[x][y].ui64 = scatterLSB(maskWords[x] >> (y * 8));
+}
+```
+
+`scatterLSB(maskWords[x] >> (y*8))` extracts byte `y` of word `x` and scatters
+its 8 bits into the LSB of each of the 8 output bytes. The `y`-loop is independent
+for fixed `x` and vectorizes under `#pragma omp simd`; the 8 outer `x`-iterations
+are fully independent, allowing the OOO engine to interleave multiply chains and
+hide shift latency.
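+
+A scalar sanity check of `scatterLSB` (illustrative only):
+
+```cpp
+#include <cassert>
+#include <cstdint>
+
+int main()
+{
+    const uint64_t out = scatterLSB(0xB1); // 0b1011'0001
+    // Bit z of the source byte must land in the LSB of output byte z.
+    for (int z = 0; z < 8; z++)
+        assert(((out >> (8 * z)) & 0xFFu) == ((0xB1u >> z) & 1u));
+    return 0;
+}
+```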
+ +GCC applies two automatic strength reductions: +- Stage-1 factoring: `(1+2^14)(1+2^28)` is computed as two `vpsllq`+`vpaddq` + pairs (4 instructions) rather than 3 shifts + 3 ORs (6 instructions). +- Z-pass fusion: indicator fill and Z-pass are emitted as a single fused block + with no intervening store/reload. + +### 16g. AVX2 Performance + +Benchmarked in `simd_test/plan1_prefix_bench.cpp` and `simd_test/step_timing_bench.cpp` +(`rdtsc`-based, 50 000–100 000 iterations, 256-entry pre-generated input buffer). + +**Per-step breakdown (inlined, 100 000 iterations):** + +| Step | Description | Cycles/call | +|------|-------------|-------------| +| 1+2 | Indicator fill + Z-pass (fused) | 55.9 | +| 3a | Extract row popcounts (`>> 56`) | 26.8 | +| 3b | Exclusive y-prefix scan | 35.2 | +| 3c+d | Broadcast + add | 31.2 | +| 4+5 | Zero-extend + xOffset add | 47.0 | +| **Total** | | **196 cycles** | + +The `noinline` version (as seen by an external caller) measures **~377 cycles**, +with the ~180-cycle penalty attributable to 192 stack spills generated by AVX2's +16-register file being insufficient to keep all 16 YMM data blocks live +simultaneously. + +**Comparison with legacy software-popcount approach:** +512 independent `countOn(maskWords[x] & prefix_mask)` calls × ~15 instructions +(software Hamming weight, no hardware `popcnt`) ≈ 1 500–2 500 cycles. +Plan #1 delivers a **~10–15× speedup** without requiring a `popcnt` instruction. + +**AVX-512 note:** GCC emits YMM instructions even with `-march=sapphirerapids`, +missing the opportunity to process all 8 `y` values per word in a single ZMM +operation. It does replace the 3-step Z-pass chain with `vpmullq %ymm_kSpread` +(recognising that `(1+2^8)(1+2^16)(1+2^32) = kSpread`), saving 5 instructions +per block. Explicit `__m512i` intrinsics would be needed to unlock the full +ZMM path and eliminate register spills. + +--- + +## 17. 513-Entry Exclusive Prefix Layout and shfl_down Compaction + +### 17a. 513-Entry Exclusive Prefix Layout (initial design) + +The initial design allocates a 513-entry array, sets `prefixSums[0] = 0`, and passes +`prefixSums + 1` to `buildMaskPrefixSums`: + +```cpp +uint16_t prefixSums[513]; +prefixSums[0] = 0; +util::buildMaskPrefixSums(leaf.valueMask(), leaf.data()->mPrefixSum, prefixSums + 1); +``` + +Result after the call: +- `prefixSums[i]` = exclusive prefix at position i = 0-based rank of active voxel i. +- `prefixSums[i+1]` = inclusive prefix at position i (what buildMaskPrefixSums wrote). +- `prefixSums[512]` = total active voxel count of the leaf. + +For the shfl_down compaction, `shifts[i]` = number of inactive positions in [0..i-1]: +```cpp +shifts[i] = i - prefixSums[i] +``` +This was the approach used in the initial shfl_down implementation. See §17g for the +refined design that eliminates `prefixSums[]` and the explicit subtraction loop. + +### 17b. shfl_down Predicate -- Source vs. Destination + +Section 4e describes the predicate as `(shifts[i] & (1< +static void shflDownSep(const uint16_t* __restrict__ src, + const uint16_t* __restrict__ shifts, + uint16_t* __restrict__ dst) +{ + #pragma omp simd + for (int j = 0; j < 512 - Shift; j++) { + const uint16_t m = static_cast( + -static_cast((shifts[j + Shift] & static_cast(Shift)) != 0)); + dst[j] = (src[j + Shift] & m) | (src[j] & ~m); + } + for (int j = 512 - Shift; j < 512; j++) + dst[j] = src[j]; +} +``` + +**Arithmetic mask derivation**: `(shifts[j+Shift] & Shift) != 0` produces 0 or 1 (int). +Negating as int gives 0 or -1 = 0x00000000 or 0xFFFFFFFF. 
+
+**Critical CMake fix**: `#pragma omp simd` requires `-fopenmp` to be passed to the host
+compiler. For CUDA source files, CMake does NOT automatically add `-Xcompiler -fopenmp`
+even when `OpenMP::OpenMP_CXX` is linked. The CMakeLists.txt must explicitly set:
+```cmake
+$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-mavx2,-fopenmp>
+```
+Without this, `#pragma omp simd` is treated as an unknown pragma and silently ignored,
+causing the loop to compile as fully scalar code (~250ms vs ~15ms measured).
+
+### 17d. Full shfl_down Compaction (9 Passes, Ping-Pong Buffers)
+
+The 9 passes alternate between two buffers so each call has fully separate __restrict__
+source and destination pointers. buf0 is initialized with the identity; after 9 passes
+(odd count), the result is in buf1. See §17g for the current form of the preamble that
+builds `shifts[]`.
+
+```cpp
+uint16_t buf0[512], buf1[512];
+for (int i = 0; i < 512; i++) buf0[i] = static_cast<uint16_t>(i);
+
+shflDownSep<  1>(buf0, shifts, buf1);
+shflDownSep<  2>(buf1, shifts, buf0);
+shflDownSep<  4>(buf0, shifts, buf1);
+shflDownSep<  8>(buf1, shifts, buf0);
+shflDownSep< 16>(buf0, shifts, buf1);
+shflDownSep< 32>(buf1, shifts, buf0);
+shflDownSep< 64>(buf0, shifts, buf1);
+shflDownSep<128>(buf1, shifts, buf0);
+shflDownSep<256>(buf0, shifts, buf1);
+
+const uint16_t* leafLocalOffsets = buf1;
+```
+
+### 17e. Range Intersection and Output Fill
+
+The contiguous-copy approach (sections 3, 6, 7) replaces the per-voxel scatter. Two VBM
+invariants simplify the early-exit (see §17g):
+- No leaf has zero active voxels.
+- Active voxel ranges across leaves are contiguous and monotonically ordered.
+
+These guarantee that no leaf in the iteration range is entirely before the block, so the
+only guard needed is a `break` (not `continue`) when a leaf starts at or after the block end:
+
+```cpp
+if (leafFirstOffset >= blockFirstOffset + BlockWidth) break;
+```
+
+Range intersection and output:
+```cpp
+const uint64_t globalStart = std::max(leafFirstOffset, blockFirstOffset);
+const uint64_t globalEnd   = std::min(leafFirstOffset + leafValueCount,
+                                      blockFirstOffset + BlockWidth);
+const uint64_t jStart = globalStart - leafFirstOffset;
+const uint64_t pStart = globalStart - blockFirstOffset;
+const uint64_t count  = globalEnd - globalStart;
+
+std::fill(leafIndex + pStart, leafIndex + pStart + count, (uint32_t)leafID);
+std::copy(leafLocalOffsets + jStart, leafLocalOffsets + jStart + count, voxelOffset + pStart);
+```
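+
+A worked instance of the intersection arithmetic (hypothetical numbers, not from the
+measured runs): with `BlockWidth = 128`, `blockFirstOffset = 257`, and a leaf with
+`leafFirstOffset = 200`, `leafValueCount = 300`:
+
+```cpp
+// globalStart = max(200, 257)             = 257
+// globalEnd   = min(200 + 300, 257 + 128) = min(500, 385) = 385
+// jStart      = 257 - 200 = 57   (skip the leaf's first 57 active voxels)
+// pStart      = 257 - 257 = 0    (this leaf covers the block from slot 0)
+// count       = 385 - 257 = 128  (the whole block comes from this one leaf)
+```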
+
+### 17f. Performance History
+
+**Performance (2M voxels / 16384 blocks / 25% occupancy / 24 OMP threads / AVX2):**
+- Original `getValue()` loop: ~77 ms
+- `buildMaskPrefixSums` + bit-scan scatter: ~65 ms
+- shfl_down without vectorization (in-place, no __restrict__): ~250 ms
+- shfl_down with proper vectorization (two-buffer __restrict__ + -fopenmp): ~15-20 ms
+- After §17g refactor (buildMaskPrefixSums<true>, no prefixSums[]): ~14-20 ms (no regression)
+
+**Key lessons:**
+1. The in-place single-buffer form does NOT vectorize; two buffers + __restrict__ required.
+2. `#pragma omp simd` requires -Xcompiler=-fopenmp in the CUDA host compile flags.
+3. The arithmetic mask form (-(cond != 0)) is needed to avoid branch-vs-blend ambiguity.
+
+### 17g. Elimination of prefixSums[] via buildMaskPrefixSums<true>
+
+`shifts[i]` = exclusive count of 0-bits (inactive voxels) at positions 0..i-1 is exactly
+what `buildMaskPrefixSums` produces when run over the bitwise complement of the mask.
+Adding a `template <bool Invert>` parameter to `buildMaskPrefixSums` allows
+writing `shifts[]` directly, eliminating the `prefixSums[513]` array and the explicit
+`shifts[i] = i - prefixSums[i]` subtraction loop:
+
+```cpp
+uint16_t shifts[513];
+shifts[0] = 0;
+util::buildMaskPrefixSums<true>(leaf.valueMask(), leaf.data()->mPrefixSum, shifts + 1);
+
+const uint16_t leafValueCount = static_cast<uint16_t>(512u) - shifts[512];
+```
+
+Result: `shifts[i]` = exclusive 0-bit prefix at i for i=0..511 (used by `util::shuffleDownMask`).
+`shifts[512]` = total inactive count; `leafValueCount` falls out as `512 - shifts[512]`.
+
+**How `buildMaskPrefixSums<true>` works**: two changes from the default (`Invert=false`):
+1. Step 1 (indicator fill): inverts the mask word before transposing: `~maskWords[x]`.
+2. Step 5 (cross-word offsets): for word x, the exclusive 0-bit count equals
+   `64*x - ones`, where `ones` is the exclusive 1-bit count decoded from `mPrefixSum`.
+   The original (non-inverted) `mPrefixSum` field is passed unchanged by the caller.
+
+**Stack savings**: eliminates 1026 bytes (`prefixSums[513]`) and one 512-iteration pass.
+The `shifts[513]` array (1026 bytes) replaces the old `shifts[512]` (1024 bytes) at
+negligible cost (+2 bytes) while removing the need for `prefixSums[]` entirely.
+
+### 17h. shflDown → util::shuffleDownMask; generalize and promote to utility
+
+The single-buffer `shflDown` (introduced in the §17d revision) is generalized and
+promoted to a free function `nanovdb::util::shuffleDownMask` in `VoxelBlockManager.h`
+(a candidate for a future `nanovdb/util/Algo.h`). The final signature:
+
+```cpp
+template <int N, int Shift, typename DataT, typename MaskT>
+inline void shuffleDownMask(DataT* NANOVDB_RESTRICT data,
+                            const MaskT* NANOVDB_RESTRICT masks,
+                            MaskT maskBits)
+{
+    static_assert(Shift > 0 && Shift < N, "Shift must satisfy 0 < Shift < N");
+    static_assert(std::is_unsigned_v<DataT>, "DataT must be an unsigned integer type");
+    static_assert(std::is_unsigned_v<MaskT>, "MaskT must be an unsigned integer type");
+    #pragma omp simd
+    for (int j = 0; j < N - Shift; j++) {
+        const DataT m = (masks[j + Shift] & maskBits) != 0 ? ~DataT{0} : DataT{0};
+        data[j] = (data[j + Shift] & m) | (data[j] & ~m);
+    }
+}
+```
+
+**Changes from the §17d single-buffer form**:
+- `shifts` → `masks` (parameter name); `& Shift` → `& maskBits` (runtime parameter, no
+  default — caller is explicit). `maskBits` is always a literal at every call site so
+  the compiler constant-folds it.
+- Generalized from `uint16_t` to `DataT` / `MaskT` (any unsigned integer types).
+- Generalized from hardcoded `512` to template parameter `N`.
+- Blend mask uses `~DataT{0}` / `DataT{0}` (all-ones / all-zeros of the correct width)
+  instead of the `-static_cast<int>(bool)` trick — clearer and equally efficient.
+- `static_assert` guards on `Shift > 0 && Shift < N` and unsigned types.
+- Extracted from `VoxelBlockManager` private static → `nanovdb::util` free function.
+- `NANOVDB_RESTRICT` replaces `__restrict__` for portability (MSVC uses `__restrict`).
+- Renamed `shflDown` → `shuffleDownMask`: "shuffle down" follows the CUDA
+  `__shfl_down_sync` convention; "Mask" denotes the predicate table parameter.
+ The operation is a conditional fixed-distance gather from higher-indexed positions, + not an arbitrary permutation — "shuffle" is preferred over "shift" because of the + conditional, data-dependent nature of the movement. + +**Call site**: +```cpp +util::shuffleDownMask<512, 1>(buf, shifts, uint16_t{ 1}); +// ... through ... +util::shuffleDownMask<512, 256>(buf, shifts, uint16_t{256}); +``` +No behavioral change from §17d. + +**Future consideration — header migration**: `shuffleDownMask` currently lives in +`VoxelBlockManager.h` because that is where it was first needed. It is a generic, +VBM-independent primitive with no dependency on any VBM type. A natural future home +is a dedicated utility header such as `nanovdb/util/Algo.h` (does not yet exist) or +`nanovdb/util/Util.h`. Migration should be deferred until at least one second call +site emerges outside the VBM, to avoid premature abstraction. diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md new file mode 100644 index 0000000000..2aa902e7be --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md @@ -0,0 +1,269 @@ +# VBM Implementation Knowledge Base + +This document captures non-obvious design decisions, rejected alternatives, invariants, +and performance phenomenology for the VoxelBlockManager (VBM) subsystem and its CPU +port of `decodeInverseMaps`. It is written as dense, structured, agent-consumable +facts rather than narrative. It complements `VoxelBlockManagerContext.md` (semantics +and API) and `DecodeInverseMapsCPUPlan.md` (algorithm design journal). + +--- + +## 1. NanoVDB Leaf Mask Layout — Ambient Space + +A `Mask<3>` stores 512 bits as 8 `uint64_t` words (`maskWords[x]`, x = 0..7). +The NanoVDB linear voxel index within a leaf is `i = x*64 + y*8 + z`, where: +- x = word index (0..7) +- y = byte index within the word (0..7) — bits 8y..8y+7 of `maskWords[x]` +- z = bit index within the byte (0..7) + +**Invariant**: word x covers leaf-local positions x*64 .. x*64+63. The bit at +position `y*8+z` within word x corresponds to leaf-local index `x*64 + y*8 + z`. + +**Why this matters**: every algorithm that operates on the mask must respect this +layout. The "fast" dimension (z, innermost) is the bit dimension within a byte. +The "slow" dimension (y) is the byte dimension within a word. The "word" dimension +(x) is independent across the 8 words. + +--- + +## 2. mPrefixSum Encoding + +`LeafIndexBase::mPrefixSum` (a `uint64_t`) encodes 7 exclusive cumulative popcounts +at the word boundaries, packed as 7 x 9-bit fields: + + bits 9*(x-1) .. 9*(x-1)+8 = total active voxels in words 0..x-1, x = 1..7 + +- Field x-1 is the **exclusive** prefix at word x — i.e., how many active voxels + precede word x in the leaf. +- xOffset[0] = 0 always (implicit, not stored). +- Maximum value per field: 512 (all 512 voxels in preceding words active) — fits in + 9 bits. +- **Precondition for buildMaskPrefixSums**: the caller must pass the leaf's own + `mPrefixSum` field, not a recomputed value. The field is set during grid + construction and is authoritative. + +--- + +## 3. 
buildMaskPrefixSums — Output Semantics + +`util::buildMaskPrefixSums(mask, prefixSum, offsets[512])` produces: + + offsets[i] = number of active voxels at leaf-local positions 0..i (inclusive) + +- Output is **inclusive** and **1-based**: for an active voxel at position i, + `offsets[i]` is its 1-based rank among all active voxels in the leaf. +- Output is **leaf-local**: cross-word offsets from `mPrefixSum` are folded in, + but the global leaf-first-offset (`leaf.data()->firstOffset()`) is NOT added. + `offsets[i]` is in range [1, 512]. +- For an **inactive** voxel at position i: `offsets[i] == offsets[i-1]` (no + increment). These values are valid and load-bearing for the `shfl_down` + compaction approach (see §6). +- To recover the **exclusive** (0-based) rank of an active voxel at i: + `offsets[i] - 1`. +- To recover the **global sequential index** of an active voxel at position i: + `leafFirstOffset - 1 + offsets[i]` + where `leafFirstOffset = leaf.data()->firstOffset()` is the 1-based global index + of the leaf's first active voxel. + +--- + +## 4. transposeByteRow — What It Does and Why + +`transposeByteRow(src)` treats the low 8 bits of `src` as the first row of an 8x8 +bit matrix and returns the result of transposing it: + + output.ui8[z] = (src >> z) & 1 for z = 0..7 + +It is the single-row specialization of `transposeBits8x8`: if only the first row of +an 8x8 bit matrix is non-zero, the full transpose reduces to this operation. + +**Why it is used in buildMaskPrefixSums (Step 1 — indicator fill)**: +The algorithm needs `data[x][y].ui8[z] = indicator(x, y, z)` — a 0/1 value per +(x, y, z) triple. The indicator for (x, y, z) is bit `y*8+z` of `maskWords[x]`, +which is bit z of byte y of `maskWords[x]`. `transposeByteRow(maskWords[x] >> (y*8))` +extracts byte y and places bit z into byte z of the result — exactly the required +indicator layout. + +**Equivalent hardware instruction**: `_pdep_u64(src & 0xFF, kSpread)` on x86, where +`kSpread = 0x0101010101010101`. The software implementation is used for portability. + +**Why not `& kSpread` (the simpler alternative)**: `(maskWords[x] >> z) & kSpread` +would extract bit z from each byte — which is what Plan #2 needed (data[x][z] layout). +Plan #1 uses data[x][y] layout, requiring the byte dimension to be the output axis, +not the input axis. See §5 for the Plan #1 vs Plan #2 decision. + +--- + +## 5. Plan #1 vs Plan #2 — The Rejected Alternative + +**Plan #2** would have used layout `data[x][z].ui8[y]`: +- Simpler indicator fill: `data[x][z].ui64 = (maskWords[x] >> z) & kSpread` + (just a shift and mask — no `transposeByteRow` needed). +- After the z-pass (Hillis-Steele within-uint64 over y), the output is in + `data[x][z]` order, not `data[x][y]` order. +- **Fatal cost**: before zero-extending to uint16_t, a `transposeBytes8x8` call + per x-slice is required to reorder from `data[x][z].ui8[y]` to the required + linear output order. This is ~200 instructions per call x 8 calls = ~1600 + instructions of transpose overhead, dominating the ~14-cycle z+y passes. + +**Plan #1** (chosen) uses layout `data[x][y].ui8[z]`: +- Indicator fill requires `transposeByteRow` — slightly more expensive than `& kSpread`. +- After the z-pass and y-pass, the output is already in linear order (`x*64 + y*8 + z`). +- **No output transpose**: zero-extension to uint16_t is a straight vpmovzxbw over + contiguous memory. + +**Decision criterion**: Plan #1 eliminates the expensive output transpose at the cost +of a cheaper input transformation. 
The output transpose (1024 bytes) is intrinsically +more expensive than the input transformation (64 bytes). + +--- + +## 6. Compaction Approaches — Inclusive vs Exclusive, Active vs Inactive + +Two approaches exist for building `leafLocalOffsets[j]` (local position of j-th active +voxel) from `offsets`: + +**Scatter approach** (simple, used in decodeInverseMaps): + For each active position i: `leafLocalOffsets[offsets[i] - 1] = i` + Only requires `offsets[i]` at active positions. Inactive positions are not read. + +**shfl_down approach** (deeper SIMD, see DecodeInverseMapsCPUPlan §4e): + Requires `shifts[i] = i - (offsets[i] - isActive(i))` for ALL positions, active + and inactive. The `move[i]` predicate for each pass depends on `shifts[i]` even + for inactive voxels. `buildMaskPrefixSums` correctly provides values at all 512 + positions for this purpose. + +**Key invariant**: `offsets` from `buildMaskPrefixSums` is valid at all 512 positions, +not just active ones. This is intentional and necessary for the shfl_down path. + +--- + +## 7. decodeInverseMaps CPU — Current Implementation and Performance History + +**Current implementation** (`VoxelBlockManager.h`, branch `vbm-cpu-port`): +For each leaf overlapping the block: +1. Early-exit `break` if `leafFirstOffset >= blockFirstOffset + BlockWidth` (monotonicity + invariant: all subsequent leaves also fall outside the block). +2. Build `shifts[513]` directly via `buildMaskPrefixSums`: `shifts[0]=0`, + `buildMaskPrefixSums(..., shifts+1)` writes inclusive 0-bit counts into + `shifts[1..512]`, giving `shifts[i]` = exclusive count of inactive voxels in [0..i-1]. + `leafValueCount = 512 - shifts[512]` as a free by-product. +3. Run 9 in-place shuffle-down passes (Shift=1,2,4,...,256) via `util::shuffleDownMask` with a single buffer. +4. Range fill `leafIndex[pStart..pEnd)` and contiguous copy from `leafLocalOffsets`. + +**Eliminated from earlier design**: the separate `prefixSums[513]` array and the explicit +`shifts[i] = i - prefixSums[i]` subtraction loop. `buildMaskPrefixSums` writes +`shifts[]` directly in one pass by running the prefix sum over the inverted mask words and +adjusting the cross-word offsets as `64*x - ones` instead of `ones`. + +**Performance history (2M voxels / 16384 blocks / 25% occupancy / 24 OMP threads / AVX2)**: + +All figures below are wall-clock time for the full `decodeInverseMaps` loop over all +16384 blocks. Entries marked *unoptimized* were measured with `CMAKE_BUILD_TYPE` unset +(effectively `-O0` host compilation via NVCC); entries marked *Release* used +`-DCMAKE_BUILD_TYPE=Release` (`-O3 -DNDEBUG`). + +- Original `getValue()` loop (unoptimized): ~77 ms +- `buildMaskPrefixSums` + bit-scan scatter (unoptimized): ~65 ms (~15% improvement) +- `shuffleDownMask` without vectorization (-fopenmp missing in CUDA host flags, unoptimized): ~250 ms +- `shuffleDownMask` with vectorization (two-buffer `__restrict__`, -fopenmp fixed, unoptimized): ~15-20 ms +- `shuffleDownMask` single-buffer + `buildMaskPrefixSums` (unoptimized): ~15-17 ms (no regression) +- `shuffleDownMask` single-buffer + `buildMaskPrefixSums` (**Release**): **~0.9-1.6 ms (~15x speedup)** + +**Critical finding — build type**: with `CMAKE_BUILD_TYPE` unset, NVCC compiles host +code without optimization. Template functions (`shuffleDownMask`, `buildMaskPrefixSums`) +are not inlined, the `#pragma omp simd` loops are not vectorized, and all locals are +spilled to the stack. The resulting ~15-17 ms figure is not representative of real +performance. 
Always benchmark with `-DCMAKE_BUILD_TYPE=Release`. + +**Critical finding — -fopenmp**: `#pragma omp simd` is silently ignored for CUDA host +code unless `-Xcompiler -fopenmp` is explicitly added. Linking `OpenMP::OpenMP_CXX` +does NOT automatically propagate compile flags to CUDA sources via CMake. Without +-fopenmp, the shuffle-down passes compile as scalar loops. + +**Why shuffleDownMask beats bit-scan with vectorization**: +- Bit-scan is inherently scalar (data-dependent BSF/BSR instruction, variable trip count). +- shuffleDownMask's 9 passes are fixed-width, data-independent loops over 512 elements. +- With AVX2 (16 uint16_t per register), each pass takes ~32 vector ops vs ~128 scalar ops + for the bit-scan at 25% occupancy. + +**Recommended CMake invocation** (from the `build/` directory): +```bash +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES=89 \ + -DOpenVDB_ROOT=/home/esifakis/local \ + -DNANOVDB_USE_CUDA=ON \ + -DNANOVDB_USE_OPENVDB=ON \ + -DNANOVDB_USE_TBB=ON \ + -DNANOVDB_BUILD_EXAMPLES=ON \ + -DNANOVDB_BUILD_UNITTESTS=ON \ + -DNANOVDB_BUILD_TOOLS=ON +``` + +--- + +## 8. Critical Portability Notes + +**`UINT64_C` vs `UL` suffix**: always use `UINT64_C(...)` for 64-bit hex constants. +`UL` is 32 bits on MSVC/Windows. Occurrences of `UL`-suffixed 64-bit constants in +`MorphologyHelpers.h` were corrected to `UINT64_C`. + +**`__restrict__` portability**: `__restrict__` (GCC/Clang) vs `__restrict` (MSVC). +NanoVDB has no existing C++ portability macro for this. `CNanoVDB.h` defines +`RESTRICT` but only for the C API. A `NANOVDB_RESTRICT` macro should be added +if `__restrict__` is used in host-only headers. + +**`#pragma omp simd` is safe without OpenMP**: unknown pragmas are silently ignored +by standard-conforming C++17 compilers. All major compilers recognize the `omp` +namespace. The pragma is present in `buildMaskPrefixSums` for portability; GCC 13.3 +auto-vectorizes the loops correctly even without it. + +**`#pragma omp simd` defeated by hardware POPCNT**: under `-mavx2`, GCC replaces +software Hamming-weight expressions with scalar `popcntl` and ignores the simd pragma. +The `popcount32` + `#pragma omp simd` approach in `vbm_host_cuda_kernels.cu` (sanity +bench) requires `-mno-popcnt` to vectorize, which is unsuitable for production. +`buildMaskPrefixSums` avoids this by not using popcount at all. + +--- + +## 9. No __hostdev__ on buildMaskPrefixSums — Deliberate Decision + +`buildMaskPrefixSums` is CPU-only. A CUDA equivalent would be organized around the +32-thread warp (using `__ballot_sync`, warp-level prefix intrinsics, or cooperative +group reductions) and would look fundamentally different. Marking it `__hostdev__` +would be misleading about intended usage and would invite incorrect porting. + +The CUDA `decodeInverseMaps` already has its own highly optimized implementation. +The CPU and CUDA decode paths are expected to remain separate implementations +indefinitely. + +--- + +## 10. Two Essential VBM Invariants (Not Enforced by NanoVDB Itself) + +These invariants are not structurally guaranteed by the NanoVDB data format, but all +reasonable ways of constructing a VBM-compatible grid enforce them, and the VBM +algorithms depend on them for correctness and for simplifying early-exit logic. + +**Invariant A — No empty leaves**: every leaf node in a VBM grid has at least one active +voxel (`leafValueCount > 0`). 
This is essential for the JumpMap encoding to give an +unambiguous leafID per block: a leaf with zero active voxels would contribute nothing to +the sequential voxel index but would still occupy a slot in the leaf array, breaking the +1-to-1 mapping between sequential voxel ranges and leaf IDs. + +**Invariant B — Monotonically non-decreasing voxelID → leafID mapping, step ≤ 1**: the +global sequential active voxel index increases monotonically across leaves, and consecutive +leaves' active voxel ranges are contiguous (no gaps, no overlaps). Formally: + `leafFirstOffset[k+1] = leafFirstOffset[k] + leafValueCount[k]` +This means active voxel ranges partition the global index space without gaps. + +**Consequences for `decodeInverseMaps`**: +- The check `leafValueCount == 0` is dead code and can be omitted (Invariant A). +- The check `leafFirstOffset + leafValueCount <= blockFirstOffset` (leaf entirely before + block) is impossible for any leaf in the iteration range, given that `firstLeafID` is + chosen to be the leaf containing `blockFirstOffset` (Invariant B + correct firstLeafID). +- When `leafFirstOffset >= blockFirstOffset + BlockWidth` fires for one leaf, it holds for + all subsequent leaves (Invariant B), so `break` is correct instead of `continue`. +- The early-exit collapses to a single line: + `if (leafFirstOffset >= blockFirstOffset + BlockWidth) break;` diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VoxelBlockManagerContext.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VoxelBlockManagerContext.md new file mode 100644 index 0000000000..1ef2ab9bfb --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VoxelBlockManagerContext.md @@ -0,0 +1,259 @@ +# VoxelBlockManager: Context and Design Guide + +This document captures the intent, semantics, and usage patterns of the +VoxelBlockManager (VBM) acceleration structure for NanoVDB OnIndex grids. +It is intended to give a complete mental model to anyone (human or AI assistant) +working on VBM development or writing SIMT kernels that consume it. + +--- + +## What problem does the VBM solve? + +A NanoVDB `ValueOnIndex` (OnIndex) grid assigns a unique sequential integer +index to each active voxel. These indices are dense: if the grid has N active +voxels, they are numbered 1..N (the index 0 is reserved). This sequential +layout enables efficient SIMT parallelism: a GPU kernel can launch one thread +per active voxel, with thread k processing voxel k. + +The challenge is *decoding*: given a voxel's sequential index, how do you find +its 3D coordinates and leaf node? The index space is flat but the tree is +hierarchical. Scanning the tree for each voxel is too expensive. + +The VBM solves this by precomputing two small metadata arrays that let any +thread, given only its sequential index, quickly determine which leaf node +contains that voxel and where within the leaf it lives. This decode is done +cooperatively at thread-block granularity (one thread block per "voxel block"), +using shared memory. + +--- + +## Core concepts + +### Voxel blocks + +The VBM partitions the active voxel index space into contiguous spans of +`BlockWidth` voxels called *voxel blocks*. `BlockWidth` is a compile-time +power of two (typically 128). Block `b` covers sequential indices: + + [firstOffset + b * BlockWidth, firstOffset + (b+1) * BlockWidth - 1] + +A GPU kernel processes one voxel block per thread block, with `BlockWidth` +threads. 
Thread `t` in block `b` is responsible for voxel index +`firstOffset + b * BlockWidth + t`. + +### firstOffset and lastOffset + +`firstOffset` is the base of the VBM's index range. It must satisfy +`firstOffset == 1 (mod BlockWidth)` (i.e., it is "block-aligned"). For a +single-GPU build covering the full grid, `firstOffset = 1` and +`lastOffset = activeVoxelCount()`. + +In a **multi-GPU** setting, each rank owns a contiguous slice of the active +voxel index space. Rank r uses `firstOffset_r` and `lastOffset_r` to build a +VBM covering only its slice, even though all ranks hold a copy of the full +grid topology. The VBM metadata is then sized and indexed relative to each +rank's own `firstOffset`. + +### blockCount + +`blockCount` is the **allocated capacity** of the VBM metadata buffers. It +must be >= `ceil((lastOffset - firstOffset + 1) / BlockWidth)` but may be +larger. This allows pre-allocating a larger handle and rebuilding in-place +for a range that grows over time, without reallocating. + +--- + +## Metadata arrays + +### firstLeafID (uint32_t[blockCount]) + +`firstLeafID[b]` is the index of the **first leaf node** that overlaps voxel +block `b`. A leaf overlaps block `b` if any of its active voxels fall in +`[firstOffset + b*BlockWidth, firstOffset + (b+1)*BlockWidth - 1]`. + +In a sequential NanoVDB grid, leaf nodes are laid out contiguously in memory +in ascending order of their first active voxel index. So `firstLeafID[b]` is +the smallest leaf index `i` such that leaf `i` has at least one active voxel +in block `b`. + +### jumpMap (uint64_t[blockCount * JumpMapLength]) + +`JumpMapLength = BlockWidth / 64`. The jumpMap for block `b` is a bitfield of +`BlockWidth` bits (stored as `JumpMapLength` uint64_t words) where bit `p` is +set if and only if a new leaf node **begins** at position `p` within block `b` +(i.e., some leaf's first active voxel has sequential index +`firstOffset + b*BlockWidth + p`, and `p > 0`). + +Bit 0 is never set (the leaf starting exactly at the block boundary is +recorded in `firstLeafID`, not the jumpMap). + +Together, `firstLeafID[b]` and the jumpMap for block `b` enumerate all leaf +nodes that overlap block `b`: start at `firstLeafID[b]`, then count the set +bits in the jumpMap to find how many additional leaves follow. + +--- + +## Build API + +### Handle: VoxelBlockManagerHandle + +A pure data holder. Owns two buffers (`firstLeafID` and `jumpMap`) and stores +`blockCount`, `firstOffset`, `lastOffset`. No allocation or build logic is in +the handle itself. A default-constructed handle is the canonical "null/empty" +state (`blockCount == 0`). + +Accessors: +- `blockCount()`, `firstOffset()`, `lastOffset()` +- `hostFirstLeafID()` / `hostJumpMap()` -- host-side pointers +- `deviceFirstLeafID()` / `deviceJumpMap()` -- device-side pointers (only when + BufferT has a device dual, e.g. 
`cuda::DeviceBuffer` or a unified buffer)
+
+### Two-overload pattern
+
+Both the CPU (`nanovdb::tools::`) and CUDA (`nanovdb::tools::cuda::`) build
+functions follow the same two-overload pattern, mirroring the NodeManager
+convention in NanoVDB:
+
+**Allocating overload** -- returns a new, fully-constructed handle:
+
+    // CPU
+    auto handle = nanovdb::tools::buildVoxelBlockManager(grid);
+
+    // CUDA
+    auto handle = nanovdb::tools::cuda::buildVoxelBlockManager(d_grid, stream);
+
+Optional parameters (with sentinel 0 meaning "derive from grid"):
+- `firstOffset` -- defaults to 1
+- `lastOffset` -- defaults to `activeVoxelCount()` (CPU) or read from device
+  via `DeviceGridTraits::getActiveVoxelCount()` (CUDA)
+- `nBlocks` -- defaults to `ceil((lastOffset - firstOffset + 1) / BlockWidth)`
+
+If `lastOffset < firstOffset` (e.g., empty grid), a default-constructed null
+handle is returned immediately with no allocation attempted.
+
+**Rebuild-in-place overload** -- takes a pre-allocated handle by reference,
+zeroes the jumpMap, and recomputes both arrays. No allocation:
+
+    // CPU
+    nanovdb::tools::buildVoxelBlockManager(grid, handle);
+
+    // CUDA
+    nanovdb::tools::cuda::buildVoxelBlockManager(d_grid, handle, stream);
+
+A null handle (`blockCount == 0`) is silently ignored (no-op). This overload
+is the right choice for benchmarking or for rebuilding after a
+topology-preserving update without paying allocation cost.
+
+The allocating overload delegates to the rebuild overload after allocating
+buffers -- no logic duplication.
+
+---
+
+## CPU vs GPU implementation asymmetry
+
+### GPU: launch at lower-node granularity
+
+A NanoVDB level-1 internal node ("lower node") has up to 4096 leaf child
+slots (16^3). The GPU kernel launches one CTA per lower node (subdivided
+into `SlicesPerLowerNode = 8` slices for additional parallelism), so each
+thread handles approximately one leaf child slot per iteration. Threads check
+the lower node's `childMask` to skip empty slots.
+
+This grouping is chosen because:
+- It naturally sizes the CTA workload (4096 slots / 8 slices / 128 threads = 4 slots per thread)
+- Threads in the same warp access leaves from the same lower node, improving
+  memory access locality
+- The grid is `<<<lowerNodeCount * SlicesPerLowerNode, 128>>>`
+
+The cost is wasted threads for sparse lower nodes (few active leaf children
+out of 4096 slots). This is an acceptable trade-off on the GPU.
+
+### CPU: iterate leaves directly
+
+On the CPU there is no benefit to the lower-node grouping. The build uses
+`std::for_each(std::execution::par, firstLeaf, firstLeaf + leafCount, ...)`,
+iterating directly over the flat contiguous leaf array. Each task processes
+exactly one leaf -- no child mask checks, no wasted iterations.
+
+Leaf index is computed by pointer arithmetic: `&leaf - firstLeaf`.
+
+### Why firstLeafID writes are race-free
+
+In a sequential NanoVDB grid, leaf offset ranges are non-overlapping and
+ordered. A leaf that spans from block `a` to block `a+k` (max k=4 for
+BlockWidth=128 and a non-block-aligned leaf with <=512 active voxels)
+"backward-fills" `firstLeafID[a+1..a+k]`. No other leaf can start in a block
+before `a+k` without its offset range overlapping leaf `a`'s range -- which
+is impossible. Hence at most one leaf writes each `firstLeafID[b]` entry,
+so the writes are non-atomic.
+
+The jumpMap writes, by contrast, require atomic OR because multiple leaves
+from different parts of the tree can start at positions within the same block.
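+
+A hedged sketch of the per-leaf write pattern this implies (names follow the text
+above; the C++20 `std::atomic_ref` OR is illustrative only -- the actual build code
+may use a different atomic mechanism):
+
+    // One task per leaf. rel = leaf's first active voxel, 0-based within the VBM range.
+    const uint64_t rel      = leafFirstOffset - firstOffset;
+    const uint64_t begBlock = rel / BlockWidth;
+    const uint64_t endBlock = (rel + leafValueCount - 1) / BlockWidth;
+    const uint32_t p        = uint32_t(rel % BlockWidth); // start position within begBlock
+
+    if (p == 0) // leaf begins exactly at the block boundary: it owns firstLeafID there
+        firstLeafID[begBlock] = uint32_t(leafID);
+    else        // otherwise its start is recorded as a jumpMap bit (atomic OR)
+        std::atomic_ref<uint64_t>(jumpMap[begBlock * JumpMapLength + p / 64])
+            .fetch_or(1ull << (p % 64), std::memory_order_relaxed);
+
+    for (uint64_t b = begBlock + 1; b <= endBlock; ++b) // backward-fill: unique writer
+        firstLeafID[b] = uint32_t(leafID);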
+
+---
+
+## Kernel usage pattern (SIMT consumer)
+
+The typical VBM-powered kernel:
+
+    __global__ void myKernel(
+        NanoGrid<ValueOnIndex>* grid,
+        const uint32_t* firstLeafID,
+        const uint64_t* jumpMap,
+        uint64_t firstOffset, uint64_t lastOffset, uint32_t nBlocks)
+    {
+        __shared__ uint32_t smem_leafIndex[BlockWidth];
+        __shared__ uint16_t smem_voxelOffset[BlockWidth];
+
+        int blockID = blockIdx.x;
+        uint64_t blockFirstOffset = firstOffset + (uint64_t)blockID * BlockWidth;
+
+        // Cooperative decode: all threads in the block participate
+        VoxelBlockManager::decodeInverseMaps(
+            grid,
+            firstLeafID[blockID],
+            &jumpMap[blockID * VoxelBlockManager::JumpMapLength],
+            blockFirstOffset,
+            smem_leafIndex,
+            smem_voxelOffset);
+        // smem_leafIndex[t] and smem_voxelOffset[t] now hold the leaf index
+        // and intra-leaf voxel offset for thread t's voxel.
+        // Entries beyond lastOffset are filled with UnusedLeafIndex / UnusedVoxelOffset.
+
+        int t = threadIdx.x;
+        uint64_t globalIndex = blockFirstOffset + t;
+        if (globalIndex > lastOffset) return;
+        if (smem_leafIndex[t] == VoxelBlockManager::UnusedLeafIndex) return;
+
+        // From here: access the voxel's 3D position, values, or stencil.
+        // VoxelBlockManager::computeBoxStencil(...) uses the same
+        // smem arrays to look up the 27 stencil neighbor indices.
+    }
+
+    // Launch: one block per VBM block
+    myKernel<<<vbmHandle.blockCount(), BlockWidth>>>(
+        d_grid,
+        vbmHandle.deviceFirstLeafID(),
+        vbmHandle.deviceJumpMap(),
+        vbmHandle.firstOffset(),
+        vbmHandle.lastOffset(),
+        (uint32_t)vbmHandle.blockCount());
+
+Key points:
+- `decodeInverseMaps` must be called by ALL threads in the block
+  (it uses `__syncthreads` internally). Do not call from divergent threads.
+- `computeBoxStencil` does NOT synchronize and may be called per-thread.
+- Voxels in the last partial block beyond `lastOffset` get sentinel values
+  (`UnusedLeafIndex = 0xffffffff`, `UnusedVoxelOffset = 0xffff`); always
+  guard with a `globalIndex <= lastOffset` check.
+
+---
+
+## Files
+
+- `nanovdb/tools/VoxelBlockManager.h` -- Handle class, CPU build functions
+- `nanovdb/tools/cuda/VoxelBlockManager.cuh` -- VoxelBlockManager device struct,
+  CUDA build functions
+- `nanovdb/examples/ex_voxelBlockManager_host_cuda/` -- end-to-end example,
+  benchmarks CPU vs GPU build time, validates CPU/GPU metadata agreement,
+  validates full inverse map against grid structure
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda.cpp b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda.cpp
new file mode 100644
index 0000000000..2ebc9370e0
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda.cpp
@@ -0,0 +1,96 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/*!
+  \file vbm_host_cuda.cpp
+
+  \brief Test harness for the VoxelBlockManager (CUDA reference implementation).
+
+  Generates a random sparse domain at a configurable occupancy level using a
+  Morton-curve layout, builds a ValueOnIndex NanoVDB grid, constructs the
+  VoxelBlockManager on the GPU, decodes the full inverse map (leafIndex,
+  voxelOffset per active voxel), and validates the result on the host.
+
+  Usage: vbm_host_cuda [ambient_voxels [occupancy]]
+    ambient_voxels  Total universe of voxel positions (default: 8388608)
+    occupancy       Fraction of positions that are active in [0,1] (default: 0.25)
+*/
+
+#include <nanovdb/NanoVDB.h>
+
+#include <algorithm>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+void runVBMCudaTest(const std::vector<nanovdb::Coord>& coords);
+
+/// @brief Unpack one component of a Morton-encoded index into a coordinate.
+/// Keeps every third bit of the input, then packs them into a contiguous integer.
+static uint32_t coordinate_bitpack(uint32_t x)
+{
+    x &= 0x49249249;
+    x |= (x >> 2);  x &= 0xc30c30c3;
+    x |= (x >> 4);  x &= 0x0f00f00f;
+    x |= (x >> 8);  x &= 0xff0000ff;
+    x |= (x >> 16); x &= 0x0000ffff;
+    return x;
+}
+
+/// @brief Generate active voxel coordinates at the requested occupancy level.
+/// Voxels are drawn uniformly at random from a Morton-curve layout over
+/// ambient_voxels positions, giving spatially coherent 3D coordinates.
+static std::vector<nanovdb::Coord>
+generateDomain(int ambient_voxels, float occupancy, uint32_t seed = 42)
+{
+    const int target = (int)(occupancy * (float)ambient_voxels);
+
+    std::mt19937 rng(seed);
+    std::uniform_int_distribution<int> dist(0, ambient_voxels - 1);
+
+    std::vector<bool> voxmap(ambient_voxels, false);
+    int active = 0;
+    while (active < target) {
+        int i = dist(rng);
+        if (!voxmap[i]) { voxmap[i] = true; ++active; }
+    }
+
+    std::vector<nanovdb::Coord> coords;
+    coords.reserve(active);
+    for (int i = 0; i < ambient_voxels; ++i) {
+        if (voxmap[i]) {
+            coords.emplace_back(
+                (int)coordinate_bitpack( i       & 0x49249249),
+                (int)coordinate_bitpack((i >> 1) & 0x49249249),
+                (int)coordinate_bitpack((i >> 2) & 0x49249249));
+        }
+    }
+    return coords;
+}
+
+int main(int argc, char** argv)
+{
+    try {
+        int   ambient_voxels = 8 * 1024 * 1024;
+        float occupancy      = 0.25f;
+
+        if (argc > 1) ambient_voxels = std::stoi(argv[1]);
+        if (argc > 2) occupancy      = std::stof(argv[2]);
+
+        occupancy = std::max(0.0f, std::min(1.0f, occupancy));
+
+        std::cout << "ambient_voxels = " << ambient_voxels << "\n"
+                  << "occupancy      = " << occupancy << "\n";
+
+        auto coords = generateDomain(ambient_voxels, occupancy);
+        std::cout << "Active voxels generated: " << coords.size() << "\n";
+
+        runVBMCudaTest(coords);
+    }
+    catch (const std::exception& e) {
+        std::cerr << "An exception occurred: \"" << e.what() << "\"\n";
+        return 1;
+    }
+    return 0;
+}
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu
new file mode 100644
index 0000000000..9070bc38d9
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu
@@ -0,0 +1,476 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/*!
+  \file vbm_host_cuda_kernels.cu
+
+  \brief CUDA implementation for the VoxelBlockManager test harness.
+
+  Builds the VoxelBlockManager from a ValueOnIndex grid, decodes the full
+  inverse map (leafIndex, voxelOffset) for all active voxels, downloads the
+  result to the host, and validates it against the grid structure.
+  Also benchmarks CPU vs GPU VBM construction with repeated timed runs.
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace { + +static constexpr int Log2BlockWidth = 7; +static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128 + +using VBM = nanovdb::tools::cuda::VoxelBlockManager; +using CPUVBM = nanovdb::tools::VoxelBlockManager; + +/// @brief Software 32-bit popcount (Hamming weight) via the AND/shift/add/multiply path. +/// Unlike hardware POPCNT (which is scalar), this compiles to VPMULLD under AVX2 and +/// vectorizes across all 16 lanes of a uint32_t vertical sweep over a valueMask. +inline uint32_t popcount32(uint32_t x) +{ + x = x - ((x >> 1) & 0x55555555u); + x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); + x = (x + (x >> 4)) & 0x0f0f0f0fu; + return (x * 0x01010101u) >> 24; +} + +/// @brief For each VBM block, decode the inverse map and store +/// (leafIndex, voxelOffset) for every active voxel into global output arrays +/// indexed by (globalVoxelOffset - firstOffset). +/// Launch configuration: <<>> +__global__ void decodeAllBlocksKernel( + nanovdb::NanoGrid* grid, + const uint32_t* firstLeafID, + const uint64_t* jumpMap, + uint64_t firstOffset, + uint64_t lastOffset, + uint32_t nBlocks, + uint32_t* outLeafIndex, + uint16_t* outVoxelOffset) +{ + __shared__ uint32_t smem_leafIndex[BlockWidth]; + __shared__ uint16_t smem_voxelOffset[BlockWidth]; + + uint32_t bID = blockIdx.x; + if (bID >= nBlocks) return; + + uint64_t blockFirstOffset = firstOffset + (uint64_t)bID * BlockWidth; + + VBM::decodeInverseMaps( + grid, + firstLeafID[bID], + &jumpMap[(uint64_t)bID * VBM::JumpMapLength], + blockFirstOffset, + smem_leafIndex, + smem_voxelOffset); + + int tID = threadIdx.x; + uint64_t globalIdx = blockFirstOffset + tID; + + if (globalIdx <= lastOffset && smem_leafIndex[tID] != VBM::UnusedLeafIndex) { + uint64_t k = globalIdx - firstOffset; // 0-based + outLeafIndex[k] = smem_leafIndex[tID]; + outVoxelOffset[k] = smem_voxelOffset[tID]; + } +} + +/// @brief For each VBM block, decode the inverse map into shared memory and +/// write a token value per block to dummyOut to prevent dead code elimination. +/// No per-voxel output is written to global memory. +/// Launch configuration: <<>> +__global__ void benchDecodeKernel( + nanovdb::NanoGrid* grid, + const uint32_t* firstLeafID, + const uint64_t* jumpMap, + uint64_t firstOffset, + uint64_t lastOffset, + uint32_t nBlocks, + uint32_t* dummyOut) +{ + __shared__ uint32_t smem_leafIndex[BlockWidth]; + __shared__ uint16_t smem_voxelOffset[BlockWidth]; + + uint32_t bID = blockIdx.x; + if (bID >= nBlocks) return; + + uint64_t blockFirstOffset = firstOffset + (uint64_t)bID * BlockWidth; + + VBM::decodeInverseMaps( + grid, + firstLeafID[bID], + &jumpMap[(uint64_t)bID * VBM::JumpMapLength], + blockFirstOffset, + smem_leafIndex, + smem_voxelOffset); + + // Token write from thread 0 only to prevent dead code elimination + if (threadIdx.x == 0) { + dummyOut[bID] = + (smem_leafIndex[0] != VBM::UnusedLeafIndex ? 1u : 0u) + + (smem_voxelOffset[0] != VBM::UnusedVoxelOffset ? 
1u : 0u); + } +} + +} // anonymous namespace + +void runVBMCudaTest(const std::vector& coords) +{ + const uint64_t nCoords = coords.size(); + + // --- Build ValueOnIndex grid on GPU --- + nanovdb::Coord* d_coords = nullptr; + cudaCheck(cudaMalloc(&d_coords, nCoords * sizeof(nanovdb::Coord))); + cudaCheck(cudaMemcpy(d_coords, coords.data(), + nCoords * sizeof(nanovdb::Coord), cudaMemcpyHostToDevice)); + + auto handle = nanovdb::tools::cuda::voxelsToGrid( + d_coords, nCoords); + cudaCheck(cudaFree(d_coords)); + + auto* d_grid = handle.deviceGrid(); + if (!d_grid) throw std::runtime_error("Failed to create device grid"); + + // Download grid to host for validation and CPU build + handle.deviceDownload(); + auto* h_grid = handle.grid(); + if (!h_grid) throw std::runtime_error("Failed to download host grid"); + + const auto& tree = h_grid->tree(); + const uint64_t nVoxels = h_grid->activeVoxelCount(); + const uint32_t nBlocks = (uint32_t)((nVoxels + BlockWidth - 1) >> Log2BlockWidth); + + std::cout << "Active voxels (unique): " << nVoxels << "\n" + << "Leaf nodes : " << tree.nodeCount(0) << "\n" + << "Lower nodes : " << tree.nodeCount(1) << "\n" + << "Upper nodes : " << tree.nodeCount(2) << "\n" + << "VBM blocks : " << nBlocks + << " (BlockWidth=" << BlockWidth << ")\n\n"; + + // --- Benchmark VBM construction: GPU vs CPU --- + // Allocate handles once; timing runs reuse the buffers (memset + kernel only, + // no allocation overhead). First run per device serves as warmup - important + // for unified-memory buffers where the first access triggers page migration. + static constexpr int nRuns = 5; + + auto gpuHandle = nanovdb::tools::cuda::buildVoxelBlockManager(d_grid); + auto cpuHandle = nanovdb::tools::buildVoxelBlockManager(h_grid); + + // GPU build (cudaMemsetAsync + kernel, pre-allocated buffers) + { + float minMs = std::numeric_limits::max(); + for (int i = 0; i < nRuns; ++i) { + cudaCheck(cudaDeviceSynchronize()); // ensure stream is idle before timing + nanovdb::util::cuda::Timer gpuTimer; + nanovdb::tools::cuda::buildVoxelBlockManager(d_grid, gpuHandle); + float ms = gpuTimer.elapsed(); // records stop event and synchronizes + if (i > 0) minMs = std::min(minMs, ms); + } + std::cout << "GPU buildVoxelBlockManager (memset+kernel): min " << minMs + << " ms (over " << nRuns-1 << " post-warmup runs)\n"; + } + + // CPU build (std::memset + std::for_each(par), pre-allocated buffers) + { + float minMs = std::numeric_limits::max(); + for (int i = 0; i < nRuns; ++i) { + nanovdb::util::Timer cpuTimer; + cpuTimer.start(""); + nanovdb::tools::buildVoxelBlockManager(h_grid, cpuHandle); + float ms = (float)cpuTimer.elapsed() / 1000.0f; + if (i > 0) minMs = std::min(minMs, ms); + } + std::cout << "CPU buildVoxelBlockManager (memset+forEachPar): min " << minMs + << " ms (over " << nRuns-1 << " post-warmup runs)\n\n"; + } + + // --- Validate CPU build against GPU build --- + // Download GPU metadata to host and compare byte-for-byte with the CPU handle. 
+ { + const uint64_t firstLeafIDBytes = gpuHandle.blockCount() * sizeof(uint32_t); + const uint64_t jumpMapBytes = gpuHandle.blockCount() * (BlockWidth / 64) * sizeof(uint64_t); + + std::vector gpuFirstLeafID(gpuHandle.blockCount()); + std::vector gpuJumpMap(gpuHandle.blockCount() * (BlockWidth / 64)); + + cudaCheck(cudaMemcpy(gpuFirstLeafID.data(), gpuHandle.deviceFirstLeafID(), + firstLeafIDBytes, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(gpuJumpMap.data(), gpuHandle.deviceJumpMap(), + jumpMapBytes, cudaMemcpyDeviceToHost)); + + const bool firstLeafIDMatch = std::memcmp(gpuFirstLeafID.data(), + cpuHandle.hostFirstLeafID(), firstLeafIDBytes) == 0; + const bool jumpMapMatch = std::memcmp(gpuJumpMap.data(), + cpuHandle.hostJumpMap(), jumpMapBytes) == 0; + + if (firstLeafIDMatch && jumpMapMatch) + std::cout << "CPU/GPU metadata match: PASSED\n"; + else + std::cerr << "CPU/GPU metadata match: FAILED" + << (firstLeafIDMatch ? "" : " [firstLeafID mismatch]") + << (jumpMapMatch ? "" : " [jumpMap mismatch]") << "\n"; + } + + // gpuHandle is the last-built GPU VBM; use it for decode/validation below + auto& vbmHandle = gpuHandle; + + // --- Decode all blocks into unified memory --- + // Using thrust::universal_vector so the kernel writes directly into host-accessible + // memory, avoiding a separate cudaMemcpy download step. + thrust::universal_vector outLeafIndex(nVoxels); + thrust::universal_vector outVoxelOffset(nVoxels); + + decodeAllBlocksKernel<<>>( + d_grid, + vbmHandle.deviceFirstLeafID(), + vbmHandle.deviceJumpMap(), + vbmHandle.firstOffset(), vbmHandle.lastOffset(), nBlocks, + outLeafIndex.data().get(), outVoxelOffset.data().get()); + cudaCheckError(); + cudaCheck(cudaDeviceSynchronize()); + + // --- Validate GPU decodeInverseMaps on host --- + // For each active voxel index k+1, the decoded (leafIndex, voxelOffset) + // must satisfy: leaf[leafIndex].getValue(voxelOffset) == k+1 + { + const auto* firstLeaf = tree.getFirstNode<0>(); + uint64_t errors = 0; + + for (uint64_t k = 0; k < nVoxels; ++k) { + const uint32_t leafIdx = outLeafIndex[k]; + const uint16_t voxelOff = outVoxelOffset[k]; + const uint64_t expectedIdx = k + vbmHandle.firstOffset(); + const uint64_t decodedIdx = firstLeaf[leafIdx].getValue(voxelOff); + + if (decodedIdx != expectedIdx) { + if (errors < 5) + std::cerr << "ERROR at k=" << k + << ": expected index " << expectedIdx + << ", decoded " << decodedIdx + << " (leaf=" << leafIdx + << " voxelOff=" << voxelOff << ")\n"; + ++errors; + } + } + + if (errors == 0) + std::cout << "GPU decodeInverseMaps: PASSED (all " << nVoxels + << " entries validated)\n"; + else + std::cerr << "GPU decodeInverseMaps: FAILED (" << errors << " / " + << nVoxels << " entries incorrect)\n"; + } + + // --- Validate CPU decodeInverseMaps against GPU results --- + // cpuHandle metadata was already verified to agree with gpuHandle above. + // Run the CPU decode block-by-block and compare against the unified-memory + // GPU output (which is now host-accessible after the sync above). 
+ { + const uint32_t* firstLeafIDPtr = cpuHandle.hostFirstLeafID(); + const uint64_t* jumpMapPtr = cpuHandle.hostJumpMap(); + + std::vector cpuLeafIndex(BlockWidth); + std::vector cpuVoxelOffset(BlockWidth); + uint64_t errors = 0; + + for (uint32_t bID = 0; bID < nBlocks; ++bID) { + const uint64_t blockFirstOffset = + cpuHandle.firstOffset() + (uint64_t)bID * BlockWidth; + + CPUVBM::decodeInverseMaps( + h_grid, + firstLeafIDPtr[bID], + &jumpMapPtr[(uint64_t)bID * CPUVBM::JumpMapLength], + blockFirstOffset, + cpuLeafIndex.data(), + cpuVoxelOffset.data()); + + for (int tID = 0; tID < BlockWidth; ++tID) { + const uint64_t k = (uint64_t)bID * BlockWidth + tID; + if (k >= nVoxels) break; + if (cpuLeafIndex[tID] != outLeafIndex[k] || + cpuVoxelOffset[tID] != outVoxelOffset[k]) { + if (errors < 5) + std::cerr << "CPU/GPU decodeInverseMaps mismatch at k=" << k + << ": cpu=(" << cpuLeafIndex[tID] << "," + << cpuVoxelOffset[tID] << ") gpu=(" + << outLeafIndex[k] << "," << outVoxelOffset[k] + << ")\n"; + ++errors; + } + } + } + + if (errors == 0) + std::cout << "CPU decodeInverseMaps vs GPU: PASSED\n"; + else + std::cerr << "CPU decodeInverseMaps vs GPU: FAILED (" + << errors << " mismatches)\n"; + } + + // --- GPU decodeInverseMaps performance benchmark --- + // benchDecodeKernel decodes into shared memory only; one token uint32_t per + // block is written to dummyOut to prevent dead code elimination. + { + thrust::universal_vector dummyOut(nBlocks, 0); + + static constexpr int nPerfRuns = 20; + + // Warmup launch (first kernel invocation may incur JIT / page-migration cost) + benchDecodeKernel<<>>( + d_grid, + vbmHandle.deviceFirstLeafID(), vbmHandle.deviceJumpMap(), + vbmHandle.firstOffset(), vbmHandle.lastOffset(), nBlocks, + dummyOut.data().get()); + cudaCheck(cudaDeviceSynchronize()); + + std::cout << "\nGPU decodeInverseMaps performance (" + << nBlocks << " blocks, " << nVoxels << " voxels):\n"; + + for (int run = 0; run < nPerfRuns; ++run) { + cudaCheck(cudaDeviceSynchronize()); + nanovdb::util::cuda::Timer gpuTimer; + benchDecodeKernel<<>>( + d_grid, + vbmHandle.deviceFirstLeafID(), vbmHandle.deviceJumpMap(), + vbmHandle.firstOffset(), vbmHandle.lastOffset(), nBlocks, + dummyOut.data().get()); + const float ms = gpuTimer.elapsed(); + std::cout << " run " << run << ": " << ms << " ms\n"; + } + + // Sum dummyOut on host (unified memory, already sync'd by last gpuTimer.elapsed()) + uint64_t dummy = 0; + for (uint32_t i = 0; i < nBlocks; ++i) dummy += dummyOut[i]; + std::cout << " (dummy=" << dummy << ")\n"; + } + + // --- CPU decodeInverseMaps performance benchmark --- + // Scratch arrays are reused across all blocks so the only memory traffic is + // reading VBM metadata and leaf data -- not writing nVoxels of output. + // Token reads of scratch[0] after each block defeat dead code elimination + // without adding meaningful cost. + { + const uint32_t* firstLeafIDPtr = cpuHandle.hostFirstLeafID(); + const uint64_t* jumpMapPtr = cpuHandle.hostJumpMap(); + + static constexpr int nPerfRuns = 20; + + // --- Sanity check: countOn + std::fill only, no tree access --- + { + std::cout << "\nCPU decodeInverseMaps sanity (countOn only):\n"; + + // Never true: used only to prevent dead code elimination. 
+ volatile uint32_t dummy = 0; + + for (int run = 0; run < nPerfRuns; ++run) { + nanovdb::util::Timer timer; + timer.start(""); + + nanovdb::util::forEach(0, nBlocks, 1, + [&](const nanovdb::util::Range1D& range) { + for (auto bID = range.begin(); bID < range.end(); ++bID) { + int nExtraLeaves = 0; + for (int i = 0; i < CPUVBM::JumpMapLength; i++) + nExtraLeaves += nanovdb::util::countOn( + jumpMapPtr[(uint64_t)bID * CPUVBM::JumpMapLength + i]); + + // Reinterpret the first leaf's 8 x uint64_t valueMask as + // 16 x uint32_t words, one per group of 32 consecutive voxels. + const auto& leaf = + h_grid->tree().getFirstNode<0>()[firstLeafIDPtr[bID]]; + const uint32_t* maskWords = + reinterpret_cast(leaf.valueMask().words()); + + // Phase 1: per-word inclusive prefix counts. + // prefixCountRealigned[step][lane] = popcount(maskWords[lane] & mask) + // where mask covers bits 0..step (inclusive). + // At step=31, mask=0xFFFFFFFF so row[31] == wordPopcount[lane]. + // Safe mask form: (uint32_t(2) << step) - 1u avoids UB at step=31. + alignas(32) uint32_t prefixCountRealigned[32][16]; + for (int step = 0; step < 32; step++) { + const uint32_t mask = (uint32_t(2) << step) - 1u; + #pragma omp simd + for (int lane = 0; lane < 16; lane++) + prefixCountRealigned[step][lane] = + popcount32(maskWords[lane] & mask); + } + + // Phase 2: exclusive prefix scan of row[31] -> baseOffset[lane], + // then add baseOffset to every row to get global prefix counts. + uint32_t baseOffset[16]; + baseOffset[0] = 0; + for (int lane = 1; lane < 16; lane++) + baseOffset[lane] = baseOffset[lane-1] + + prefixCountRealigned[31][lane-1]; + + for (int step = 0; step < 32; step++) { + #pragma omp simd + for (int lane = 0; lane < 16; lane++) + prefixCountRealigned[step][lane] += baseOffset[lane]; + } + + // Dummy: global prefix count at the last voxel equals the total + // active voxel count for this leaf, which is <= 512, never 513. + if (prefixCountRealigned[31][15] == 513u) + dummy = prefixCountRealigned[31][15]; + } + }); + + const float ms = + (float)timer.elapsed() / 1000.0f; + std::cout << " run " << run << ": " << ms << " ms\n"; + } + } + + std::atomic dummy{0}; + + std::cout << "\nCPU decodeInverseMaps performance (" + << nBlocks << " blocks, " << nVoxels << " voxels):\n"; + + for (int run = 0; run < nPerfRuns; ++run) { + nanovdb::util::Timer timer; + timer.start(""); + + nanovdb::util::forEach(0, nBlocks, 1, + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t scratchLeafIndex[BlockWidth]; + alignas(64) uint16_t scratchVoxelOffset[BlockWidth]; + for (auto bID = range.begin(); bID < range.end(); ++bID) { + const uint64_t blockFirstOffset = + cpuHandle.firstOffset() + (uint64_t)bID * BlockWidth; + + CPUVBM::decodeInverseMaps( + h_grid, + firstLeafIDPtr[bID], + &jumpMapPtr[(uint64_t)bID * CPUVBM::JumpMapLength], + blockFirstOffset, + scratchLeafIndex, + scratchVoxelOffset); + + dummy += (scratchLeafIndex[0] != CPUVBM::UnusedLeafIndex) ? 1u : 0u; + dummy += (scratchVoxelOffset[0] != CPUVBM::UnusedVoxelOffset) ? 
1u : 0u; + } + }); + + const float ms = + (float)timer.elapsed() / 1000.0f; + std::cout << " run " << run << ": " << ms << " ms\n"; + } + std::cout << " (dummy=" << dummy.load() << ")\n"; + } +} From 372b806509aeb35259887ca749a9137929613685 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 2 Apr 2026 13:50:37 -0500 Subject: [PATCH 03/60] VBMImplementationKnowledge: distill decodeInverseMaps design philosophy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove DecodeInverseMapsCPUPlan.md (implementation complete) and distill its non-obvious design decisions into the knowledge base: - §11: decodeInverseMaps is intentionally single-threaded/stateless; caller distributes blocks; contrast with cooperative GPU version. - §12: mPrefixSum is bypassed for bulk access — recomputing from raw mask words via buildMaskPrefixSums is cheaper than unpacking 9-bit fields; mPrefixSum is still used for the cross-word offset in Step 5. - §13: output fill is range-fill + contiguous copy (not scatter) because shuffleDownMask produces a sorted compacted array; std::fill/copy caveat for alignment when output arrays come from TLS or stack pointers. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../VBMImplementationKnowledge.md | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md index 2aa902e7be..105e769b9a 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/VBMImplementationKnowledge.md @@ -4,7 +4,7 @@ This document captures non-obvious design decisions, rejected alternatives, inva and performance phenomenology for the VoxelBlockManager (VBM) subsystem and its CPU port of `decodeInverseMaps`. It is written as dense, structured, agent-consumable facts rather than narrative. It complements `VoxelBlockManagerContext.md` (semantics -and API) and `DecodeInverseMapsCPUPlan.md` (algorithm design journal). +and API). --- @@ -267,3 +267,71 @@ This means active voxel ranges partition the global index space without gaps. all subsequent leaves (Invariant B), so `break` is correct instead of `continue`. - The early-exit collapses to a single line: `if (leafFirstOffset >= blockFirstOffset + BlockWidth) break;` + +--- + +## 11. decodeInverseMaps CPU — Threading and Calling Philosophy + +`decodeInverseMaps` (CPU) is **intentionally single-threaded and stateless**. It decodes +exactly one voxel block per call. The caller (`buildVoxelBlockManager` CPU path) distributes +blocks across threads via `util::forEach` (TBB or `std::thread`). There is no OpenMP thread +parallelism inside the function — `#pragma omp simd` is present only to hint the compiler +toward SIMD vectorization, not to spawn threads. + +**Consequence for callers**: the output arrays `leafIndex[BlockWidth]` and +`voxelOffset[BlockWidth]` must be caller-allocated. The natural pattern is stack allocation +inside the `util::forEach` lambda, one array per block per thread. Heap allocation (e.g., +`new[]` per block) is wasteful; thread-local storage is an option but introduces alignment +complications (see §12). + +**Contrast with GPU**: the GPU `decodeInverseMaps` is cooperative — a full CUDA thread block +(up to 512 threads) decodes one voxel block together, writing to shared memory. 
On CPU, one +thread decodes one block sequentially but with SIMD across the 512 voxel positions. + +--- + +## 12. Why mPrefixSum Is Not Used in decodeInverseMaps + +`LeafIndexBase::mPrefixSum` encodes per-word exclusive cumulative popcounts as 7 × 9-bit +packed fields (see §2). It is designed for **random access** — locating the global sequential +index of a single voxel in O(1) without scanning the whole mask. + +`decodeInverseMaps` needs the full 512-entry prefix-sum table for every active voxel in the +leaf. For this **bulk sequential access**, recomputing per-word popcounts from scratch via the +`buildMaskPrefixSums` SIMD algorithm (§3) is cheaper than unpacking the 9-bit fields. +Specifically: +- Unpacking 7 × 9-bit fields requires masked shifts and is awkward to vectorize. +- `buildMaskPrefixSums` operates on the raw 8 `uint64_t` words, uses only shifts and adds, and + vectorizes cleanly to 8 AVX2 `vpaddq`/`vpsllq` pairs. +- At 25% occupancy (typical level-set narrow band), `buildMaskPrefixSums` consumes a small + fraction of the total per-leaf work. + +`mPrefixSum` is still used indirectly: it is the `prefixSum` argument passed to +`buildMaskPrefixSums` to fold in the cross-word exclusive offsets (Step 5 of the algorithm). +What is bypassed is the idea of using `mPrefixSum` alone (without a full table build) to compute +individual voxel offsets one at a time. + +--- + +## 13. Output Fill Structure — Range Fill + Contiguous Copy, Not Scatter + +The CPU `decodeInverseMaps` fills its output arrays per leaf as: + +``` +leafIndex[pStart..pEnd) = leafID (range fill, constant value) +voxelOffset[pStart..pEnd) = leafLocalOffsets[jStart..jStart+(pEnd-pStart)) (contiguous copy) +``` + +**Why contiguous copy (not scatter)**: `shuffleDownMask` produces `leafLocalOffsets` as a +compacted array of active local positions in ascending order — position 0 holds the 0-th active +voxel's leaf-local offset, position 1 holds the 1-st, etc. Since the block's output range +`[pStart, pEnd)` maps to a contiguous slice `[jStart, jStart+len)` of this compacted array, a +single `std::copy` (or `memcpy`) suffices. No scatter is needed. + +**`std::fill`/`std::copy` vectorization caveat**: if the output pointers are stack-allocated or +derived from TLS, the compiler may not prove alignment and fall back to scalar or SSE stores even +under `-mavx2`. For the sentinel-fill initialization (filling `UnusedLeafIndex` / +`UnusedVoxelOffset` before the leaf loop), this can dominate runtime. Measurements on the test +machine showed `std::fill` via TLS taking ~1.5 ms vs ~0.22 ms with explicit `_mm256_store_si256` +over `alignas(64)` arrays (16384 blocks, 32 threads). Stack-allocate the output arrays with +`alignas(64)` and replace `std::fill` with explicit AVX2 stores when performance matters. From 9a31aeda7cf72f67b026bc2210ba482e273a8558 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 2 Apr 2026 13:50:42 -0500 Subject: [PATCH 04/60] ex_voxelBlockManager_host_cuda: remove DecodeInverseMapsCPUPlan.md Implementation complete; design rationale distilled into VBMImplementationKnowledge.md. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../DecodeInverseMapsCPUPlan.md | 1271 ----------------- 1 file changed, 1271 deletions(-) delete mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md deleted file mode 100644 index 6628ab4eee..0000000000 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/DecodeInverseMapsCPUPlan.md +++ /dev/null @@ -1,1271 +0,0 @@ -# CPU `decodeInverseMaps` Implementation Plan - -This document captures the full design for porting `VoxelBlockManager::decodeInverseMaps` -to the host. It is the companion to `VoxelBlockManagerContext.md` and serves as a starting point -for implementing the function in `nanovdb/tools/VoxelBlockManager.h`. - ---- - -## 1. Threading Model - -The GPU version uses a thread block (up to 512 threads) to decode one voxel block cooperatively. -The CPU version inverts the axes: - -- **Outer**: OpenMP thread parallelism over voxel blocks (one block per thread). -- **Inner**: SIMD within `decodeInverseMaps` for the single block assigned to the calling thread. - -`decodeInverseMaps` itself is single-threaded. It does not use OpenMP internally. The caller -(the `buildVoxelBlockManager` CPU path) is responsible for distributing blocks across threads. - ---- - -## 2. Outputs - -Like the GPU version, the function fills two arrays for a single voxel block: - -```cpp -uint32_t leafIndex[BlockWidth]; // smem_leafIndex on GPU -uint16_t voxelOffset[BlockWidth]; // smem_voxelOffset on GPU -``` - -Sentinel values for positions beyond the last active voxel: -```cpp -static constexpr uint32_t UnusedLeafIndex = 0xffffffff; -static constexpr uint16_t UnusedVoxelOffset = 0xffff; -``` - ---- - -## 3. Leaf Iteration (unchanged from GPU) - -Iterate leaf IDs from `firstLeafID[blockID]` through `firstLeafID[blockID] + nExtraLeaves`, -where `nExtraLeaves` is `popcount(jumpMap[blockID * JumpMapLength .. +JumpMapLength])`. - -For each leaf, compute: -- `leafFirstOffset = leaf.data()->firstOffset()` -- `leafValueCount = leaf.data()->valueCount()` ← number of active voxels in this leaf -- `pStart = max(0, leafFirstOffset - blockFirstOffset)` — first slot in the block's output arrays -- `pEnd = min(BlockWidth, leafFirstOffset + leafValueCount - blockFirstOffset)` — one past last slot -- `jStart = (leafFirstOffset < blockFirstOffset) ? blockFirstOffset - leafFirstOffset : 0` - — index of first active voxel in the leaf that falls inside this block - -Then: -- `leafIndex[pStart..pEnd) = leafID` (range fill, no scatter) -- `voxelOffset[pStart..pEnd) = leafLocalOffsets[jStart..jStart+(pEnd-pStart))` (contiguous copy) - -where `leafLocalOffsets[j]` = local offset (0..511) of the j-th active voxel in this leaf. - ---- - -## 4. Producing `leafLocalOffsets` via Stream Compaction - -`leafLocalOffsets` is the stream compaction of {0, 1, …, 511} under `valueMask`. It maps -dense index j → local offset of the j-th active voxel. - -The GPU scatter `smem_voxelOffset[index - blockFirstOffset] = localOffset` is *equivalent* to -this compaction: as `localOffset` increases 0..511, the scatter destination -`index - blockFirstOffset` is non-decreasing. So the scatter is really: pack all active local -offsets in order → copy a contiguous slice. - -### 4a. 
SIMD Word Layout: 16 × 32-bit Words
-
-Treat `valueMask` as **16 `uint32_t` words** (not 8 `uint64_t`). Rationale:
-
-- 16 words fill a full AVX-512 register (16 × 32-bit lanes) or two AVX2 registers (8 × 32-bit lanes).
-- The multiply step in the vertical sweep (see §5) uses `VPMULLD` (AVX2, SSE4.1) for 32-bit
- multiply, which is widely available. The 64-bit equivalent `VPMULLQ` requires AVX-512DQ.
-- Software popcount on 32-bit words uses AND/shift/add/multiply — all available as 32-bit SIMD
- ops in AVX2. A `#pragma omp simd` loop over 16 words auto-vectorizes without `VPOPCNTQ`.
-
-### 4b. Storage Layout: `prefixCountRealigned[32][16]`
-
-Declare a `uint32_t` array shaped as `[bitStep][lane]`:
-
-```cpp
-alignas(32) uint32_t prefixCountRealigned[/*bitStep*/32][/*lane*/16];
-```
-
-- **lane** (0..15): indexes which 32-bit word of the valueMask (one per group of 32 consecutive voxels).
-- **bitStep** (0..31): indexes bit position within the word.
-- `prefixCountRealigned[step][lane]` = **inclusive** prefix popcount = number of active voxels
- in positions 0..step of word `lane`.
-
-Storage is `uint32_t` throughout to match `popcount32`'s natural precision and avoid
-narrowing conversions. A full row `prefixCountRealigned[step]` is 16 × 4 = 64 bytes:
-- **AVX2**: two `__m256i` registers per row (8 uint32_t each)
-- **AVX-512**: one `__m512i` register per row (16 uint32_t) — the layout is designed for this upgrade
-
-### 4c. Phase 1 — Per-Word Inclusive Prefix Counts (SIMD)
-
-For each `step` in 0..31, compute `prefixCountRealigned[step][lane]` for all 16 lanes
-simultaneously via a `#pragma omp simd` loop:
-
-```cpp
-const uint32_t* maskWords =
- reinterpret_cast<const uint32_t*>(leaf.valueMask().words());
-
-for (int step = 0; step < 32; step++) {
- // TODO: use (uint32_t(2) << step) - 1u, NOT (1u << (step+1)) - 1u
- // The latter is UB at step=31 (shift by 32 on a 32-bit type).
- // The safe form: at step=31, (2u << 31) overflows to 0 (defined for unsigned),
- // and 0 - 1u wraps to 0xFFFFFFFF (all bits set) — correct inclusive mask.
- const uint32_t mask = (uint32_t(2) << step) - 1u;
- #pragma omp simd
- for (int lane = 0; lane < 16; lane++)
- prefixCountRealigned[step][lane] = popcount32(maskWords[lane] & mask);
-}
-```
-
-At `step=31`, `mask = 0xFFFFFFFF`, so `prefixCountRealigned[31][lane] = wordPopcount[lane]`
-(the full per-word active voxel count) — no separate word-popcount pass needed.
-
-### 4d. Phase 2 — Cross-Word Prefix Sum and Global Conversion
-
-Read the last row to get per-word counts, compute their exclusive prefix scan (scalar — short
-dependency chain), then add `baseOffset[lane]` to every row in a second SIMD pass:
-
-```cpp
-// Exclusive prefix scan of the last row → baseOffset[lane]
-uint32_t baseOffset[16];
-baseOffset[0] = 0;
-for (int lane = 1; lane < 16; lane++)
- baseOffset[lane] = baseOffset[lane-1] + prefixCountRealigned[31][lane-1];
-
-// Add baseOffset to every row: converts per-word to global prefix counts
-for (int step = 0; step < 32; step++) {
- #pragma omp simd
- for (int lane = 0; lane < 16; lane++)
- prefixCountRealigned[step][lane] += baseOffset[lane];
-}
-```
-
-`baseOffset` is constant across all 32 steps for a given lane, so each row's SIMD add is a
-simple lane-wise addition with no broadcast required.
After this pass,
-`prefixCountRealigned[step][lane]` holds the full global inclusive prefix count for voxel
-`step + 32*lane` — i.e., the sequential index of that voxel within the leaf (0-based) if it
-is active, counting all active voxels before it across all words.
-
-### 4e. Parallel Prefix Compaction via `shfl_down` (Alternative / Deeper SIMD)
-
-This approach avoids data-dependent stores entirely and is the approach validated in
-`simd_test/shfl_down_test.cpp`.
-
-**Key insight**: Define `shifts[i]` = number of zeros before position i in the bitmask =
-`i - (dense_index_of_voxel_i)`. The compaction moves each active voxel at position i down by
-`shifts[i]`. Decompose `shifts[i]` in binary: apply log2(BlockWidth) passes. Pass k moves
-elements down by 2^k *if* bit k of `shifts[i]` is set:
-
-```cpp
-// Templated fixed-offset conditional blend
-template <int Width, int Shift>
-void shfl_down(uint16_t* data, const bool* move) {
- #pragma omp simd
- for (int i = 0; i < Width - Shift; i++)
- if (move[i]) data[i] = data[i + Shift];
-}
-```
-
-Each pass is a **fixed-offset conditional copy** — the write index is data-independent.
-Compiles to clean masked blend operations:
-- **AVX-512**: `vmovdqu32` with a mask register (single instruction per pass)
-- **AVX2**: double-negate blend pattern (no register-level shuffle needed)
-
-The `move[i]` predicate for pass k is: `(shifts[i] & (1 << k)) != 0`, which itself depends on
-the bitmask but can be computed upfront via popcount before the blend passes.
-
-**Practical recommendation**: Start with the simpler vertical sweep (§4c/§4d). Fall back to
-`shfl_down` if the compiler fails to vectorize the conditional store or if profiling shows it
-is the bottleneck.
-
-**TODO**: Investigate whether this collective SIMD prefix-popcount approach could benefit the
-CUDA `decodeInverseMaps` as well. The current GPU implementation iterates all 512 voxel slots
-via `getValue()` (one thread per slot cooperatively across the warp), which is already very fast
-(~0.039 ms for 16384 blocks). Given that baseline, a rewrite is unlikely to be worthwhile, but
-it may be worth a quick look once the CPU path is mature.
-
----
-
-## 5. Bypassing `mPrefixSum`
-
-The leaf stores a packed 9-bit `mPrefixSum` for random access. Do **not** use it here.
-
-For bulk sequential access over all 512 voxels, recomputing per-word popcounts from scratch via
-SIMD is cheaper than unpacking the 9-bit packed fields (which requires masked shifts and is
-awkward to vectorize). The vertical sweep (§4c) naturally computes exactly what is needed.
-
----
-
-## 6. `leafIndex` Fill (Trivial)
-
-```cpp
-std::fill(leafIndex + pStart, leafIndex + pEnd, (uint32_t)leafID);
-```
-
-No scatter. `leafID` is constant per leaf.
-
----
-
-## 7. `voxelOffset` Fill (Contiguous Copy)
-
-```cpp
-std::copy(leafLocalOffsets + jStart,
- leafLocalOffsets + jStart + (pEnd - pStart),
- voxelOffset + pStart);
-```
-
-`leafLocalOffsets` is produced once per leaf (§4) and then sliced into the output array.
-
----
-
-## 8. Initialization
-
-Before iterating over leaves, initialize sentinel values for the whole block:
-
-```cpp
-std::fill(leafIndex, leafIndex + BlockWidth, UnusedLeafIndex);
-std::fill(voxelOffset, voxelOffset + BlockWidth, UnusedVoxelOffset);
-```
-
-**Important:** `std::fill` on a `threadprivate` TLS pointer does **not** auto-vectorize to AVX2
-stores even when `-mavx2` is enabled. The compiler cannot prove alignment through the TLS
-indirection, so it falls back to scalar or SSE stores.
Explicit AVX2 intrinsics with an
-`(__m256i*)` cast are required to get `vmovdqa` and recover the expected bandwidth. On the test
-machine (no AVX-512), using explicit `_mm256_store_si256` over `alignas(64)` arrays brought the
-initialization cost from ~1.5 ms down to ~0.22 ms for 16384 blocks across 32 OMP threads.
-
-The same issue will affect the `voxelOffset` range-fill and `leafIndex` range-fill in the
-optimized path (§6 and §7): if the output arrays are caller-allocated (stack or TLS), `std::fill`
-and `std::copy` should be replaced with explicit AVX2 stores where performance matters.
-
----
-
-## 9. Function Signature (Proposed)
-
-The CPU version mirrors the GPU signature but with plain pointers (no `__device__`, no shared
-memory, no sync):
-
-```cpp
-template </* VBM class parameters */>
-template <typename BuildT>
-void VoxelBlockManager::decodeInverseMaps(
- const NanoGrid<BuildT>* grid,
- uint32_t blockID,
- const uint32_t* firstLeafID,
- const uint64_t* jumpMap,
- uint64_t blockFirstOffset,
- uint32_t* leafIndex, // output, length BlockWidth
- uint16_t* voxelOffset) // output, length BlockWidth
-```
-
-Or as a free function in a `cpu` sub-namespace alongside `buildVoxelBlockManager` in
-`VoxelBlockManager.h`.
-
----
-
-## 10. Future Factoring
-
-Once `VoxelBlockManager` is annotated `__hostdev__` on all its members, the
-per-leaf logic shared between the CPU and GPU builds can be factored into a `__hostdev__ static`
-member (e.g., `accumulateLeafContribution(...)`) — see `project_vbm_factoring.md` in the memory
-directory. The `decodeInverseMaps` CPU/GPU split is a separate concern (SIMD vs warp cooperation)
-and will likely remain two implementations even after factoring.
-
----
-
-## 11. SIMD Codegen Experiment: `shfl_down`
-
-The `simd_test/` directory (not checked into the repo) contained two source files and four
-assembly listings produced by GCC 13.3 (`-O3 -march=avx512f` / `-O3 -march=avx2`).
-
-### Source (both files identical except for the pragma)
-
-```cpp
-// shfl_down_test.cpp — WITH #pragma omp simd
-// shfl_down_nosimd.cpp — WITHOUT #pragma omp simd (testing auto-vectorization alone)
-
-// Conditional blend: for j in [0, Width-Shift):
-// out[j] = (shifts[j+Shift] & Shift) ? in[j+Shift] : in[j]
-// for j in [Width-Shift, Width):
-// out[j] = in[j]
-template <typename T, int Width, int Shift>
-void shfl_down(const T* __restrict__ in,
- const int* __restrict__ shifts,
- T* __restrict__ out)
-{
-#pragma omp simd // omitted in shfl_down_nosimd.cpp
- for (int j = 0; j < Width - Shift; j++)
- out[j] = (shifts[j + Shift] & Shift) ? in[j + Shift] : in[j];
-
- for (int j = Width - Shift; j < Width; j++)
- out[j] = in[j];
-}
-
-// Instantiated for Shift = 1, 2, 4, 8, 16, 32, 64 with T=uint32_t, Width=128
-```
-
-### Assembly patterns observed
-
-**AVX-512** (both files produced identical output — auto-vectorization sufficed):
-```asm
-; Per 16-element chunk, for Shift=S:
-vpbroadcastd S, %zmm0 ; broadcast shift constant
-vpandd S*4(%rsi), %zmm0, %zmm2 ; mask = shifts[j+S] & S
-vpcmpd $4, %zmm1, %zmm2, %k1 ; k1 = mask != 0 (take in[j+S])
-vpcmpd $0, %zmm1, %zmm2, %k2 ; k2 = mask == 0 (take in[j])
-vmovdqu32 S*4(%rdi), %zmm3{%k1}{z} ; load in[j+S] where mask != 0
-vmovdqu32 (%rdi), %zmm2{%k2}{z} ; load in[j] where mask == 0
-vmovdqa32 %zmm3, %zmm2{%k1} ; merge
-vmovdqu32 %zmm2, (%rdx) ; store
-```
-Each pass: 2 compares + 2 masked zero-loads + 1 masked merge + 1 store per 16 elements.
- -**AVX2** (both files produced identical output): -```asm -; Per 8-element chunk: -vpand S*4(%rsi), %ymm1, %ymm3 ; mask = shifts[j+S] & S -vpcmpeqd %ymm0, %ymm3, %ymm3 ; ymm3 = (mask == 0) — "take in[j]" predicate -vpmaskmovd (%rdi), %ymm3, %ymm4 ; load in[j] where mask == 0 -vpcmpeqd %ymm0, %ymm3, %ymm2 ; ymm2 = (mask != 0) — "take in[j+S]" predicate -vpmaskmovd S*4(%rdi), %ymm2, %ymm2 ; load in[j+S] where mask != 0 -vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; blend: ymm3 selects in[j], ymm2 selects in[j+S] -vmovdqu %ymm2, (%rdx) ; store -``` -Each pass: `vpand` + 2×`vpcmpeqd` + 2×`vpmaskmovd` + `vpblendvb` + store per 8 elements. - -### Key findings - -1. **`#pragma omp simd` was not needed on GCC 13.3** — the `nosimd` version auto-vectorized - to identical output on both AVX-512 and AVX2. The pragma is still recommended for portability - across compilers with weaker auto-vectorization. It is safe to use without guards: unknown - pragmas are silently ignored by standard-conforming C++ compilers (C++17 §16.6), and all - major compilers recognize the `omp` namespace even without OpenMP enabled. - -2. **No architecture-specific intrinsics needed.** A single portable source compiles to optimal - SIMD on both targets. - -3. **No register-level shuffle instructions** (`vpermps`, `vpshufb`, etc.) appear anywhere. The - fixed compile-time offset is treated as a constant address displacement — the "shuffle" is - simply a load from `in + Shift`, which is a free addressing mode. - -4. **AVX-512 is cleaner**: 5 instructions vs AVX2's 7 per chunk, and uses mask registers - instead of `vpblendvb`. - -5. **Software `popcount32`** (Hamming weight via AND/shift/add/multiply) auto-vectorizes to - `VPMULLD` on both AVX2 and AVX-512. `VPOPCNTQ` (AVX-512VPOPCNTDQ) is **not** required. - -6. **`__restrict__` is load-bearing, not just a hint.** Without it the compiler must assume - `in` and `out` may alias, making vectorization of the loop illegal (writes to `out[j]` could - affect subsequent reads of `in[j+Shift]`). The experiment results are only valid because - `__restrict__` was present. - - `__restrict__` is a compiler extension, not standard C++ (`restrict` is C99 only). For - portability a macro is needed. NanoVDB has no existing C++ macro for this — `CNanoVDB.h` - defines `RESTRICT __restrict` but that is for the C API only. A new macro should be added: - - ```cpp - #if defined(_MSC_VER) - # define NANOVDB_RESTRICT __restrict - #else - # define NANOVDB_RESTRICT __restrict__ - #endif - ``` - - This matches the pattern used by `_CCCL_RESTRICT` in the bundled CCCL dependency. - ---- - -## 12. Benchmarking Findings (ex_voxelBlockManager_host_cuda) - -Measurements on the test machine (32 OMP threads, BlockWidth=128, 16384 blocks / 2M active -voxels, AVX2 but no AVX-512). - -### Baseline numbers - -| Path | Time per full pass | -|------|--------------------| -| GPU `decodeInverseMaps` (all blocks, `benchDecodeKernel`) | ~0.039 ms | -| CPU `decodeInverseMaps`, 32 OMP threads, unoptimized (`getValue()` loop) | ~77 ms | -| CPU initialization only (AVX2 stores, 32 threads) | ~0.22 ms | -| CPU OMP scheduling overhead (empty loop body, 16384 iterations) | ~0.002 ms | - -The GPU/CPU gap is ~2000×. The `getValue()` loop accounts for essentially all of the CPU cost. - -### OMP parallelism - -The outer loop over blocks (`#pragma omp for schedule(static)`) parallelizes correctly — a -fill-only sanity check scaled from ~77ms (single-thread equivalent) to ~1.5ms with 32 threads -(~40×). 
However the full `decodeInverseMaps` showed **zero scaling** with OMP threads. This -confirms the bottleneck is memory-bandwidth or cache-thrashing in the `getValue()` traversal, -not compute: all 32 threads together saturate available bandwidth accessing leaf data, giving no -wall-time improvement over serial. - -### `getValue()` is the bottleneck - -`getValue(localOffset)` on a `ValueOnIndex` leaf accesses `mValueMask` and the packed -`mPrefixSum` field to compute the sequential index. It is read-only but touches leaf node data -for every one of 512 slots per leaf, for every leaf overlapping the block. The unoptimized path -is O(512 × nLeaves) memory accesses per block rather than O(64 bytes of valueMask) per leaf. -Replacing this with the prefix-array approach (§4) is the primary optimization target. - -### Build flags - -`-mavx2` must be passed explicitly to both the host compiler and nvcc (`-Xcompiler -mavx2`). -Without it, `std::fill` on TLS pointers generates scalar stores. The flag is set in -`examples/CMakeLists.txt` via `target_compile_options` for `ex_voxelBlockManager_host_cuda`. - -### `prefix_popcnt_bench` standalone micro-benchmark - -`prefix_popcnt_bench.cpp` (in the same directory) isolates the Phase 1 + Phase 2 computation — -1M blocks, each with a runtime-unknown 16-word mask generated by an LCG, single-threaded, -`prefixCountRealigned[32][16]` allocated outside the loop. Results on the test machine (AVX2, -no AVX-512, GCC 13.3, `-O3 -mavx2`): - -| Implementation | Min time (1M blocks) | ns/block | -|----------------|---------------------|----------| -| Auto-vectorized (`popcount32` + `#pragma omp simd`) | ~130 ms | ~124 ns | -| Auto-vectorized with `-mno-popcnt` | ~101 ms | ~96 ns | -| Explicit AVX2 intrinsics (`vpshufb` nibble-table) | ~70 ms | ~66.5 ns | - -**Key finding — `#pragma omp simd` is silently defeated by `-mavx2`.** -When hardware POPCNT is available (implied by `-mavx2` on x86), GCC replaces the `popcount32` -Hamming-weight expression with the scalar `popcntl` instruction and then runs the lane loop -scalar. The `#pragma omp simd` hint is ignored because the compiler considers scalar `popcntl` -cheaper than the vectorized software path. The result is 16 sequential `popcntl` calls per step, -not a SIMD operation across all 16 lanes. - -With `-mno-popcnt`, GCC falls back to the software Hamming weight and auto-vectorizes correctly -to the 2×`__m256i` path (~96 ns). However `-mno-popcnt` is not suitable for production (it -disables hardware POPCNT throughout the TU, including places like `countOn` where it is wanted). - -**Explicit `vpshufb` intrinsics** (`computePrefixPopcntAVX2` in `prefix_popcnt_bench.cpp`) -bypass this issue entirely: the nibble-table lookup uses ~10 SIMD instructions per step across -all 16 lanes, without any `popcntl` in sight. At ~66.5 ns/block this is **1.87× faster** than -the auto-vectorized baseline and is the approach to use in the optimized CPU `decodeInverseMaps`. 
- -`vpshufb` popcount recipe (8 uint32 lanes per `__m256i`, applied twice for 16 lanes): -```cpp -// lut[i] = popcount(i), for i in 0..15, replicated in both 128-bit lanes -const __m256i lut = _mm256_set_epi8(4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0, - 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0); -const __m256i low4 = _mm256_set1_epi8(0x0f); -const __m256i ones8 = _mm256_set1_epi8(1); -const __m256i ones16= _mm256_set1_epi16(1); - -__m256i lo = _mm256_and_si256(v, low4); // low nibbles -__m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low4); // high nibbles -__m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo), - _mm256_shuffle_epi8(lut, hi)); // byte popcounts -__m256i s = _mm256_madd_epi16(_mm256_maddubs_epi16(cnt, ones8), ones16); // sum → 32-bit -``` -`maskWords` (constant across all 32 steps) should be loaded into two `__m256i` registers once -before the step loop — the compiler will hoist the broadcasts of `lut`, `low4`, `ones8`, `ones16` -automatically. - ---- - -## 13. Alternative Algorithm: Bit-Parallel Z+Y Prefix Sum - -This section records an alternative algorithm under investigation for computing -`uint16_t prefixSums[512]` (exclusive linear prefix popcount per voxel) from a `Mask<3>` -(`valueMask` of a leaf node), using bit-parallel operations on the 8 × 64-bit mask words. -The algorithm is implemented and tested in `simd_test/within_word_prefix_test.cpp`. - -### 13a. Data Layout - -```cpp -union qword { uint64_t ui64; uint8_t ui8[8]; }; -static constexpr uint64_t kSpread = 0x0101010101010101ULL; - -qword data[8][8]; // indexed [z][x] -// data[z][x].ui8[y] ↔ voxel (x, y, z), x = word index, y*8+z = bit within word -``` - -NanoVDB leaf linear index: `i = x*64 + y*8 + z`. Word index = x (0..7), within-word bit -position = y\*8+z, with z as fast index (bits 0..2 of each 8-bit group) and y as slow (byte -index 0..7 within the 64-bit word). - -`data[z][:]` is contiguous — 64 bytes = one cache line = two YMM registers. This enables -`#pragma omp simd` over x in both passes below. - -### 13b. Z-Pass: Indicator Fill + Running Sum - -```cpp -// z=0: extract bit 0 from each byte of each word -#pragma omp simd -for (int x = 0; x < 8; x++) - data[0][x].ui64 = maskWords[x] & kSpread; - -// z=1..7: accumulate bit z from each byte, running sum over z -for (int z = 1; z < 8; z++) { - #pragma omp simd - for (int x = 0; x < 8; x++) - data[z][x].ui64 = data[z-1][x].ui64 + ((maskWords[x] >> z) & kSpread); -} -``` - -After this pass: `data[z][x].ui8[y]` = Σ_{z'≤z} bit(x, y, z') — per-column z-prefix for -each (x, y). Per-byte maximum = 8; fits in `uint8_t` with no inter-byte carry. `vpaddq` -and `vpaddb` are equivalent here. - -**Latency hiding**: the indicator fill `(maskWords[x] >> z) & kSpread` is independent of -`data[z-1][x]`, so the OOO engine can issue it during the 1-cycle `vpaddq` latency. The -7-step dependency chain runs at ~1 cycle/step (throughput-bound, not latency-bound). - -### 13c. Y-Pass: Hillis-Steele Prefix Scan Within uint64 - -```cpp -for (int z = 0; z < 8; z++) { - #pragma omp simd - for (int x = 0; x < 8; x++) { - data[z][x].ui64 += data[z][x].ui64 << 8; - data[z][x].ui64 += data[z][x].ui64 << 16; - data[z][x].ui64 += data[z][x].ui64 << 32; - } -} -``` - -`vpsllq imm8` is fully supported in AVX2 (1-cycle throughput, 1-cycle latency). Per-byte -maximum after this pass: 64 (8 z-values × 8 y-values); still fits in `uint8_t`. No -inter-byte carry corruption since bytes evolve independently under byte-parallel arithmetic. 
- -After this pass: `data[z][x].ui8[y]` = **2D rectangle inclusive sum** = -Σ_{y'≤y, z'≤z} bit(x, y', z'). - -### 13d. Assembly Quality (GCC 13.3, -O3 -march=core-avx2) - -The compiler fully unrolls both passes and keeps all intermediate values register-resident. -The z-pass processes `data[z][:]` two YMM registers at a time (x=0..3 and x=4..7), with -one spill (z=7, x=0..3 half) due to requiring all 16 YMM registers simultaneously. The -y-pass operates directly on the register-resident z-pass results without reloading from -memory. The only missed optimization is 16 dead stores from the z-pass that are immediately -overwritten by the y-pass. Overall this is essentially what hand-written intrinsics would -produce. - -### 13e. 2D Rectangle vs Linear Prefix (Correctness Finding) - -**Key finding from `simd_test/within_word_prefix_test.cpp`**: the z+y algorithm computes a -**2D rectangle sum**, not the linear prefix sum that `getValue()` uses. - -`getValue()` for `ValueOnIndex` computes: `countOn(w & ((1ULL << (y*8+z)) - 1))` = exclusive -count of set bits at positions 0..y\*8+z−1 within word x. This is a **linear** prefix (a -staircase: all bits in rows 0..y−1 plus bits in row y up to column z). - -The 2D rectangle sum Σ_{y'≤y, z'≤z} bit(x,y',z') counts only up to column z in every -preceding row, missing the "row tails" for y' < y. Test result on 1000 random masks: -2D rectangle matches its own reference at 100% (512000/512000); linear inclusive match is -only ~26% (132806/512000), confirming the discrepancy. - -First mismatch example: at (x=0, y=1, z=0), 2D rect = 2 (bits at y=0,z=0 and y=1,z=0), -linear inclusive = 7 (all 7 bits at positions 0..8 in the word). - -### 13f. Rectangle→Linear Fixup - -The linear inclusive prefix at (x, y, z) can be recovered from the 2D rectangle data as: - -``` -linear_incl(x, y, z) = data[7][x].ui8[y-1] // all complete rows 0..y-1 (z'=0..7) - + data[z][x].ui8[y] // current row y, columns 0..z - - data[z][x].ui8[y-1] // subtract over-counted rectangle below -``` - -This simplifies to adding a y-dependent correction, expressible as a byte-parallel operation: - -```cpp -for (int z = 0; z < 8; z++) { - #pragma omp simd - for (int x = 0; x < 8; x++) - data[z][x].ui64 += (data[7][x].ui64 - data[z][x].ui64) >> 8; -} -``` - -`data[7][x].ui64` (available in registers after the y-pass) gives the full per-row popcounts -packed in bytes; the byte-shift-right-by-8 shifts row y−1's value into row y's byte lane. -This fixup is cheap — one subtract and one shift per (z, x) pair, all in-place in the -byte-packed representation. - -### 13g. Cross-Word Offsets (mPrefixSum) - -`LeafIndexBase::mPrefixSum` stores 7 nine-bit cumulative popcounts (the exclusive prefix -scan at word boundaries): - -- bits 0–8: Σ_{j=0}^{0} countOn(words[j]) = exclusive prefix at x=1 -- bits 9–17: Σ_{j=0}^{1} countOn(words[j]) = exclusive prefix at x=2 -- ... -- bits 54–62: Σ_{j=0}^{6} countOn(words[j]) = exclusive prefix at x=7 - -These are available for free and must be added to `data[z][x].ui8[y]` to obtain the full -global sequential index. However, these offsets require up to 9 bits (max value = 512), -which exceeds `uint8_t`. Two approaches for incorporating them: - -**Approach #1 — Pack offsets into a uint64 byte lane and vpaddq directly.** This fails for -any leaf where the cross-word cumulative count exceeds 255 (i.e., more than ~255 active -voxels in the preceding words — reachable for moderately dense leaves by the 4th word). -Only viable for very sparse leaves. 
- -**Approach #2 — Transpose to uint16_t prefixSums[8][8][8].** Unpack the byte-packed -`data[z][x].ui8[y]` into `uint16_t prefixSums[x][y][z]` (indexed [x][y][z] = linear order), -then add the 9-bit cross-word offsets in the wider format. Widening is safe; all values fit -in uint16_t (max = 512). The cost is a 3D index-permutation transpose -`(z,x,y) → (x,y,z)` on 64 bytes → 128 bytes. - -### 13h. Transposition Cost and Alternatives - -The output transpose (approach #2) is expensive in isolation: no loop ordering gives a -unit-stride inner loop for both source and destination simultaneously, so GCC cannot -auto-vectorize it. With explicit AVX2 intrinsics (8×8 byte matrix transpose per x-slice, -8 slices) the cost is ~200 instructions; even scalar it is ~512 operations on L1-resident -data (~400–800 cycles), dominating the ~14-cycle z+y passes. - -**Bit-transpose alternative**: pre-transpose the 8 input uint64_t words (64 bytes) instead -of post-transposing 512 uint16_t values (1024 bytes). The specific transposition that makes -the algorithm output naturally land in `[x][y][z]` memory order is: organize input as -`inputWords[y]` with bit `z*8+x` = B[x][y][z] (making y the word index, z the byte index, -and x the step variable). Transposing 64 bytes is intrinsically cheaper than transposing -1024 bytes, and the 8×8 bit-matrix transpose per y-slice is a well-studied ~10–15 instruction -operation. - -**Key tradeoff — good output order ↔ simple rectangle→linear fixup:** - -With the original layout (word=x, byte=y, step=z), the 2D rectangle is over (y, z) for fixed -x, and the rectangle→linear fixup collapses to the single byte-shift expression in §13f. - -With the bit-transposed layout (word=y, byte=z, step=x), the 2D rectangle is over (x, z) for -fixed y, and the "missing" terms for the linear prefix involve cross-word contributions from -all y-slices of preceding words — a significantly more complex expression that does not reduce -to a simple in-register byte operation. - -No 3D transposition of the input eliminates both costs simultaneously. The original layout -remains preferred for the simplicity of the fixup; the output transpose cost must be addressed -separately (either by tolerating it, using explicit intrinsics, or changing the consumer's -expected layout). - ---- - -## 14. Input Bit-Transpose: Decomposition and Implementation - -Although §13h concluded that the original layout (word=x) is preferred for fixup simplicity, -the input bit-transpose approach was further analyzed for completeness and because the output -transpose cost remains a concern. This section records the decomposition and implementation -decisions made during that analysis. - -### 14a. Target: y→z→x Layout - -To make the algorithm's output land naturally in `[x][y][z]` memory order (= NanoVDB linear -index order), the input words must be pre-arranged so that: - -``` -inputWords[y] bit (z*8 + x) = B[x][y][z] -``` - -i.e. word index = y (0..7), within-word byte index = z (0..7), within-byte bit index = x -(0..7). With this layout the algorithm's step variable becomes x, and -`data[x][y].ui8[z]` maps to voxel `(x, y, z)` — the standard linear-index order. - -This input transposition from the original NanoVDB layout (word=x, byte=y, bit=z) to the -target (word=y, byte=z, bit=x) is a 3-axis permutation of an 8×8×8 bit cube. It -decomposes into two independent transformations: - -1. 
**Step 1 — 8×8 byte-matrix transpose**: given `maskWords[x]` where byte y = B[x][y][:], - produce `tempWords[y]` where byte x = B[x][y][:]. (Only the byte-level arrangement - changes; bit ordering within each byte is unchanged.) - -2. **Step 2 — 8×8 bit-matrix transpose within each uint64**: given `tempWords[y]` where - byte x bit z = B[x][y][z], produce `inputWords[y]` where byte z bit x = B[x][y][z]. - (The byte→bit and bit→byte roles are swapped within each word.) - -### 14b. Step 2 — Bit-Matrix Transpose (Knuth 3-Round) - -This step is a standard 8×8 bit-matrix transpose applied independently to each of the 8 -uint64 words. The Knuth bit-interleaving algorithm uses three rounds of XOR/shift/AND: - -```cpp -static inline uint64_t transpose8x8bits(uint64_t x) -{ - // Round 1: swap 1×1 blocks at stride 1 within 2×2 tiles - uint64_t t = (x ^ (x >> 7)) & 0x00aa00aa00aa00aaULL; - x ^= t ^ (t << 7); - // Round 2: swap 2×2 blocks at stride 2 within 4×4 tiles - t = (x ^ (x >> 14)) & 0x0000cccc0000ccccULL; - x ^= t ^ (t << 14); - // Round 3: swap 4×4 blocks at stride 4 within 8×8 tiles - t = (x ^ (x >> 28)) & 0x00000000f0f0f0f0ULL; - x ^= t ^ (t << 28); - return x; -} -``` - -**Portability**: pure C++17, no intrinsics, no builtins. Under `#pragma omp simd` on 8 words -GCC emits ~36 scalar-width SIMD instructions (`vpsrlq`, `vpxor`, `vpand`, `vpsllq`) — fully -auto-vectorized. The 8 words are independent so there is no cross-element dependency. - -### 14c. Step 1 — Byte-Matrix Transpose (`__builtin_shufflevector`) - -The 8×8 byte-matrix transpose is a gather pattern: `tempWords[y]` byte x = byte y of -`maskWords[x]`. Compilers cannot auto-vectorize arbitrary gather patterns, so explicit -shuffle operations are required for SIMD throughput. - -On Clang, `__builtin_shufflevector` on `uint8_t __attribute__((vector_size(16)))` vectors -maps directly to architecture-appropriate byte-shuffle instructions (`vpunpcklbw`/`vpunpckhbw` -on x86, `vzip`/`vuzp` on ARM). On GCC, the equivalent is `__builtin_shuffle` with an integer -mask vector. Both builtins are already in the spirit of NanoVDB's existing use of -`__builtin_popcountll`, `__builtin_ctzl`, etc. in `nanovdb/util/Util.h`. - -The scalar fallback (64 independent byte moves) is branch-free and operates entirely on -L1-resident data — fast even without SIMD. 
Implementation with a scalar fallback and `NANOVDB_USE_INTRINSICS` guard (Clang path shown):
-
-```cpp
-using u8x16 = uint8_t __attribute__((vector_size(16)));
-
-static void byteTranspose8x8(const uint64_t src[8], uint64_t dst[8])
-{
-#if defined(__clang__) && defined(NANOVDB_USE_INTRINSICS)
- // Load 8 words as four 16-byte vectors (two words each)
- u8x16 v01, v23, v45, v67;
- __builtin_memcpy(&v01, src+0, 16); __builtin_memcpy(&v23, src+2, 16);
- __builtin_memcpy(&v45, src+4, 16); __builtin_memcpy(&v67, src+6, 16);
-
- // Round 1: interleave bytes within each pair (vpunpcklbw / vpunpckhbw)
- u8x16 t01 = __builtin_shufflevector(v01,v01, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15);
- u8x16 t23 = __builtin_shufflevector(v23,v23, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15);
- u8x16 t45 = __builtin_shufflevector(v45,v45, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15);
- u8x16 t67 = __builtin_shufflevector(v67,v67, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15);
-
- // Round 2: gather 2-byte groups across pairs (vpunpcklwd / vpunpckhwd)
- u8x16 q02lo = __builtin_shufflevector(t01,t23, 0,1,16,17, 2,3,18,19, 4,5,20,21, 6,7,22,23);
- u8x16 q02hi = __builtin_shufflevector(t01,t23, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
- u8x16 q46lo = __builtin_shufflevector(t45,t67, 0,1,16,17, 2,3,18,19, 4,5,20,21, 6,7,22,23);
- u8x16 q46hi = __builtin_shufflevector(t45,t67, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
-
- // Round 3: gather 4-byte groups across quad-pairs (vpunpckldq / vpunpckhdq)
- u8x16 r01 = __builtin_shufflevector(q02lo,q46lo, 0,1,2,3,16,17,18,19, 4,5,6,7,20,21,22,23);
- u8x16 r23 = __builtin_shufflevector(q02lo,q46lo, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
- u8x16 r45 = __builtin_shufflevector(q02hi,q46hi, 0,1,2,3,16,17,18,19, 4,5,6,7,20,21,22,23);
- u8x16 r67 = __builtin_shufflevector(q02hi,q46hi, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
-
- __builtin_memcpy(dst+0,&r01,16); __builtin_memcpy(dst+2,&r23,16);
- __builtin_memcpy(dst+4,&r45,16); __builtin_memcpy(dst+6,&r67,16);
-#else
- // Scalar fallback: 64 independent byte moves, L1-resident
- const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
- uint8_t* d = reinterpret_cast<uint8_t*>(dst);
- for (int i = 0; i < 8; i++)
- for (int j = 0; j < 8; j++)
- d[i*8+j] = s[j*8+i];
-#endif
-}
-```
-
-**GCC path**: replace `__builtin_shufflevector(a, b, ...)` with
-`__builtin_shuffle((u8x16)(a), (u8x16)(b), (u8x16){...})` using the same index patterns.
-
-**AVX-512 note**: `__builtin_shufflevector` on 16-byte vectors emits fixed-width 128-bit
-instructions. Unlike `#pragma omp simd` loops (which the compiler may promote to 256- or
-512-bit), explicit `__builtin_shufflevector` calls on `vector_size(16)` remain 128-bit even
-when targeting AVX-512. For AVX-512 width, 32-byte (`vector_size(32)`) vectors would be
-needed, processing two 8-word groups per instruction.
-
-### 14d. Full Pipeline
-
-With the input bit-transpose in place, the complete algorithm for a single leaf becomes:
-
-```
-1. byteTranspose8x8(maskWords, tempWords) // Step 1: byte-matrix transpose
-2. for y in 0..7: inputWords[y] = transpose8x8bits(tempWords[y]) // Step 2: bit-matrix transpose
-3. computeZYPrefix(inputWords, data) // Z-pass + Y-pass (§13b/13c, step variable = x)
-4. Rectangle→linear fixup (§13f formula, over (x,z) plane for fixed y — complexity TBD)
-5. Zero-extend data[x][y].ui8[z] → uint16_t prefixSums[x*64 + y*8 + z] (vpmovzxbw, unit-stride)
-6.
Add mPrefixSum cross-word offsets // 8 groups × 64 uint16_t additions, auto-vectorizable
-```
-
-**Step 5 (zero-extension)**: with output naturally in `[x][y][z]` order, the zero-extension
-from packed uint8_t to uint16_t is a unit-stride `vpmovzxbw` over 64 contiguous bytes — no
-reordering, trivially auto-vectorizable.
-
-**Step 6 (cross-word offsets)**: add a constant (from `mPrefixSum`, up to 9 bits) to each of
-the 8 groups of 64 uint16_t values — 8 broadcast-and-add SIMD operations, trivially
-auto-vectorizable.
-
-### 14e. Open Question: Fixup Formula with y→z→x Layout
-
-The rectangle→linear fixup (step 4) is well-understood for the original layout (§13f):
-with step variable = z, the 2D rectangle is over (y, z), and the correction is a simple
-byte-shift within the same word.
-
-With the y→z→x layout and step variable = x, the 2D rectangle is over (x, z) within each
-y-row. The "missing" contribution for the linear prefix consists of the complete earlier
-y-rows (y' < y) at all z values — contributions that live in *different words* of `data`.
-Whether this can be expressed as a comparably cheap byte-parallel operation on the register-
-resident `data` array is pending analysis.
-
-### 14f. Cost Summary
-
-| Step | Cost (approx) | Portability |
-|------|--------------|-------------|
-| Step 1 (byte transpose) | ~12 shuffles (SIMD) / 64 moves (scalar) | `__builtin_shufflevector` (Clang) or `__builtin_shuffle` (GCC); scalar fallback |
-| Step 2 (bit transpose) | ~36 SIMD ops (`#pragma omp simd`) | Pure C++17; auto-vectorized |
-| Z-pass + Y-pass | ~14 cycles / ~56 SIMD ops | Pure C++17; auto-vectorized |
-| Fixup | TBD (§14e) | TBD |
-| Zero-extension | ~4 `vpmovzxbw` | Auto-vectorized |
-| Cross-word offsets | ~8 broadcast+add | Auto-vectorized |
-
----
-
-## 15. Implementation Status and Next Steps (as of 2026-03-23)
-
-### 15a. Completed
-
-- **`nanovdb/util/Transpose.h`** — `transposeBits8x8` (Knuth 3-round, pure C++17,
- `__hostdev__`) and `transposeBytes8x8` (3-round butterfly via `__builtin_shuffle` /
- `__builtin_shufflevector` with scalar fallback). Both functions follow the
- `nanovdb::util` free-function style of `Util.h`. Correctness verified by
- `simd_test/transpose_test.cpp` on SIMD and scalar paths. Assembly inspected for
- AVX2 (21 instructions) and AVX-512 (27 instructions, fixed 128-bit width).
-
-- **`simd_test/within_word_prefix_test.cpp`** — correctness test for `computeZYPrefix`
- (z-pass + y-pass). Confirms the algorithm produces the 2D rectangle inclusive sum
- `data[z][x].ui8[y]` at 100% accuracy; confirms ~74% discrepancy vs linear inclusive
- prefix (as expected).
-
-- Input bit-transpose (`transposeBytes8x8` + `transposeBits8x8`) maps `maskWords[x]`
- (word=x, byte=y, bit=z) → `inputWords[y]` (word=y, byte=z, bit=x), so that the
- z-pass + y-pass output lands naturally in `data[x][y].ui8[z]` = linear `x*64+y*8+z`
- order.
-
-### 15b. Next Steps
-
-1. **Rectangle→linear fixup (§14e)** — work out the fixup formula for the y→z→x layout.
- With step variable = x, the 2D rectangle is over (z, x) within each y-row; the
- "missing" contribution (complete earlier y-rows at all z) comes from different words
- of `data` and does not reduce to the simple byte-shift of §13f. This is the key
- open design question before the pipeline is complete.
-
-2. **Zero-extension** — expand `data[x][y].ui8[z]` (byte-packed, 64 bytes) to
- `uint16_t prefixSums[512]` in `x*64+y*8+z` linear order.
Already in the correct
- memory order after the input bit-transpose; trivially auto-vectorizable via
- `vpmovzxbw`.
-
-3. **Cross-word offset addition** — add the 9-bit cumulative `mPrefixSum` offsets
- (one per x-group of 64 voxels) to the uint16_t array. 8 broadcast-and-add
- operations; trivially auto-vectorizable.
-
-4. **Inclusive→exclusive conversion** — subtract the active bit at each position
- (`(maskWords[x] >> (y*8+z)) & 1`) to convert from inclusive to exclusive prefix,
- matching `LeafData::getValue()` semantics.
-
-5. **End-to-end correctness test** — integrate all steps and verify against the
- reference `getValue()` loop for random `Mask<3>` inputs.
-
----
-
-## 16. Plan #1 — x-major Layout (`data[x][y].ui8[z]`)
-
-An alternative to the original `data[z][x]` algorithm (§13) that keeps the
-native x-y-z mask-word ordering, requires no input bit-transpose, and produces
-the linear inclusive prefix sum **directly** (no rectangle→linear fixup).
-
-### 16a. Layout
-
-```cpp
-union qword { uint64_t ui64; uint8_t ui8[8]; };
-qword data[8][8]; // data[x][y].ui8[z] ↔ voxel (x, y, z)
-```
-
-- `x` (0..7): word index — outer array dimension, slow index
-- `y` (0..7): byte-within-word — inner array dimension
-- `z` (0..7): bit-within-byte — **byte index** within the uint64
-
-For fixed `x`: `data[x][0..7]` is 64 contiguous bytes (one cache line), enabling
-`#pragma omp simd` over `y`. The byte index `z` lives *inside* each uint64, so
-the Hillis-Steele within-uint64 scan naturally operates along `z`.
-
-### 16b. Algorithm
-
-```
-Step 1 — Indicator fill (scalar triple loop; optimize later):
- data[x][y].ui8[z] = (maskWords[x] >> (y*8 + z)) & 1 = I[x][y][z]
-
-Step 2 — Z-pass: Hillis-Steele inclusive prefix sum over z within each uint64.
- for x in 0..7:
- for y in 0..7: ← simd-vectorizable (contiguous, no dep between y)
- data[x][y].ui64 += data[x][y].ui64 << 8
- data[x][y].ui64 += data[x][y].ui64 << 16
- data[x][y].ui64 += data[x][y].ui64 << 32
- After: data[x][y].ui8[z] = Σ_{z'=0..z} I[x][y][z']
- Bonus: data[x][y].ui8[7] = full row y popcount (free).
-
-Step 3 — Y-pass: exclusive row-prefix scan + broadcast.
- 3a. Extract row popcounts:
- shifts[x][y].ui64 = data[x][y].ui64 >> 56 (byte 0 = row popcount, rest = 0)
-
- 3b. Exclusive y-prefix scan of shifts:
- rowOffset[x][0] = 0
- rowOffset[x][y] = rowOffset[x][y-1] + shifts[x][y-1] for y = 1..7
- Sequential over y (loop-carried); independent over x — with a transposed
- [y][x] layout the inner x-loop is unit-stride and AVX2/AVX-512-vectorizable.
-
- 3c+3d. Broadcast byte 0 to all 8 bytes and add:
- data[x][y].ui64 += rowOffset[x][y].ui64 * kSpread
- After: data[x][y].ui8[z] = Σ_{y'<y} rowPop[x][y'] + Σ_{z'≤z} I[x][y][z']
- = linear inclusive prefix within word x.
-
-Step 4 — Zero-extension: unpack data[x][y].ui8[z] → uint16_t prefixSum[512]
- in x*64 + y*8 + z linear order (vpmovzxbw, unit-stride).
-
-Step 5 — Cross-word offsets:
- xOffset[0] = 0
- xOffset[x] = (mPrefixSum >> 9*(x-1)) & 0x1FF for x = 1..7
- prefixSum[x*64 .. x*64+63] += xOffset[x] (broadcast + vpaddw, 4 AVX2 ops/slice)
- After: prefixSum[i] = full linear inclusive prefix count within the leaf at voxel i.
-```
-
-### 16c. Why No Rectangle→Linear Fixup
-
-In the original `data[z][x]` algorithm (§13), the Y-pass accumulates a 2D
-rectangle sum and then a separate fixup step (§13f) corrects it to a linear sum.
-In Plan #1, the Y-pass adds **complete row popcounts** (`data[x][y].ui8[7]` from
-the Z-pass) as a scalar broadcast.
The scalar added to row `y` is exactly
-`Σ_{y'<y} rowPop[x][y']` — precisely the complete earlier rows that the 2D
-rectangle sum misses, so the linear inclusive prefix is produced directly.
-
-### 16d. Inclusive→Exclusive Conversion
-
-To match `LeafData<ValueOnIndex>::getValue() - mOffset`
-(exclusive), subtract the active bit:
-
-```cpp
-prefixSum[x*64 + y*8 + z] -= (maskWords[x] >> (y*8+z)) & 1u;
-```
-
-For the `decodeInverseMaps` use case (building `leafLocalOffsets[]`) the inclusive
-form is equally usable; the choice depends on the consumer's convention.
-
-### 16e. Reference and Correctness
-
-```cpp
-// Linear inclusive prefix at (x, y, z):
-// Safe mask form: (2ULL << bitPos) - 1u covers bits 0..bitPos.
-// At bitPos=63: unsigned wrap gives 0xFFFFFFFFFFFFFFFF. ✓
-uint16_t ref = xOffset[x] + countOn64(maskWords[x] & ((2ULL << (y*8+z)) - 1u));
-```
-
-Verified in `simd_test/plan1_prefix_test.cpp`: 512000/512000 positions correct
-across 1000 random `Mask<3>`-equivalent inputs.
-
-### 16f. Indicator Fill — `scatterLSB` Vectorization
-
-The original scalar triple loop (Step 1) is replaced by a multiply-free bit-scatter
-that eliminates the inner `z`-loop:
-
-```cpp
-static inline uint64_t scatterLSB(uint64_t src)
-{
- uint64_t x = src & 0xFFu;
- // Stage 1: replicate into 16-bit pairs.
- // Multiplier 2^0+2^14+2^28+2^42 = (1+2^14)(1+2^28); x≤8 bits so OR≡ADD.
- // Emits vpsllq+vpaddq pairs under AVX2/AVX-512 (no vpmuludq needed).
- x = (x | (x << 14) | (x << 28) | (x << 42)) & UINT64_C(0x0003000300030003);
- // Stage 2: separate each pair into individual byte lanes (1+2^7).
- x = (x | (x << 7)) & UINT64_C(0x0101010101010101);
- return x;
-}
-
-// Indicator fill (Step 1) — replaces triple loop:
-for (int x = 0; x < 8; x++) {
- #pragma omp simd
- for (int y = 0; y < 8; y++)
- data[x][y].ui64 = scatterLSB(maskWords[x] >> (y * 8));
-}
-```
-
-`scatterLSB(maskWords[x] >> (y*8))` extracts byte `y` of word `x` and scatters
-its 8 bits into the LSB of each of the 8 output bytes. The `y`-loop is independent
-for fixed `x` and vectorizes under `#pragma omp simd`; the 8 outer `x`-iterations
-are fully independent, allowing the OOO engine to interleave multiply chains and
-hide shift latency.
-
-GCC applies two automatic strength reductions:
-- Stage-1 factoring: `(1+2^14)(1+2^28)` is computed as two `vpsllq`+`vpaddq`
- pairs (4 instructions) rather than 3 shifts + 3 ORs (6 instructions).
-- Z-pass fusion: indicator fill and Z-pass are emitted as a single fused block
- with no intervening store/reload.
-
-### 16g. AVX2 Performance
-
-Benchmarked in `simd_test/plan1_prefix_bench.cpp` and `simd_test/step_timing_bench.cpp`
-(`rdtsc`-based, 50 000–100 000 iterations, 256-entry pre-generated input buffer).
-
-**Per-step breakdown (inlined, 100 000 iterations):**
-
-| Step | Description | Cycles/call |
-|------|-------------|-------------|
-| 1+2 | Indicator fill + Z-pass (fused) | 55.9 |
-| 3a | Extract row popcounts (`>> 56`) | 26.8 |
-| 3b | Exclusive y-prefix scan | 35.2 |
-| 3c+d | Broadcast + add | 31.2 |
-| 4+5 | Zero-extend + xOffset add | 47.0 |
-| **Total** | | **196 cycles** |
-
-The `noinline` version (as seen by an external caller) measures **~377 cycles**,
-with the ~180-cycle penalty attributable to 192 stack spills generated by AVX2's
-16-register file being insufficient to keep all 16 YMM data blocks live
-simultaneously.
-
-**Comparison with legacy software-popcount approach:**
-512 independent `countOn(maskWords[x] & prefix_mask)` calls × ~15 instructions
-(software Hamming weight, no hardware `popcnt`) ≈ 1 500–2 500 cycles.
-Plan #1 delivers a **~10–15× speedup** without requiring a `popcnt` instruction.
**AVX-512 note:** GCC emits YMM instructions even with `-march=sapphirerapids`,
-missing the opportunity to process all 8 `y` values per word in a single ZMM
-operation. It does replace the 3-step Z-pass chain with `vpmullq %ymm_kSpread`
-(recognising that `(1+2^8)(1+2^16)(1+2^32) = kSpread`), saving 5 instructions
-per block. Explicit `__m512i` intrinsics would be needed to unlock the full
-ZMM path and eliminate register spills.
-
----
-
-## 17. 513-Entry Exclusive Prefix Layout and shfl_down Compaction
-
-### 17a. 513-Entry Exclusive Prefix Layout (initial design)
-
-The initial design allocates a 513-entry array, sets `prefixSums[0] = 0`, and passes
-`prefixSums + 1` to `buildMaskPrefixSums`:
-
-```cpp
-uint16_t prefixSums[513];
-prefixSums[0] = 0;
-util::buildMaskPrefixSums(leaf.valueMask(), leaf.data()->mPrefixSum, prefixSums + 1);
-```
-
-Result after the call:
-- `prefixSums[i]` = exclusive prefix at position i = 0-based rank of active voxel i.
-- `prefixSums[i+1]` = inclusive prefix at position i (what buildMaskPrefixSums wrote).
-- `prefixSums[512]` = total active voxel count of the leaf.
-
-For the shfl_down compaction, `shifts[i]` = number of inactive positions in [0..i-1]:
-```cpp
-shifts[i] = i - prefixSums[i]
-```
-This was the approach used in the initial shfl_down implementation. See §17g for the
-refined design that eliminates `prefixSums[]` and the explicit subtraction loop.
-
-### 17b. shfl_down Predicate — Source vs. Destination
-
-Section 4e describes the predicate as `(shifts[i] & (1 << k)) != 0`, indexed at the
-destination position i. The vectorizable form tests the *source* position instead:
-element j receives `src[j + Shift]` iff `(shifts[j + Shift] & Shift) != 0`, so the
-predicate travels with the element being moved.
-
-### 17c. shflDownSep — Two-Buffer Vectorizable Form
-
-```cpp
-template <int Shift>
-static void shflDownSep(const uint16_t* __restrict__ src,
- const uint16_t* __restrict__ shifts,
- uint16_t* __restrict__ dst)
-{
- #pragma omp simd
- for (int j = 0; j < 512 - Shift; j++) {
- const uint16_t m = static_cast<uint16_t>(
- -static_cast<int>((shifts[j + Shift] & static_cast<uint16_t>(Shift)) != 0));
- dst[j] = (src[j + Shift] & m) | (src[j] & ~m);
- }
- for (int j = 512 - Shift; j < 512; j++)
- dst[j] = src[j];
-}
-```
-
-**Arithmetic mask derivation**: `(shifts[j+Shift] & Shift) != 0` produces 0 or 1 (int).
-Negating as int gives 0 or -1 = 0x00000000 or 0xFFFFFFFF. Truncating to uint16_t gives
-0x0000 or 0xFFFF. The bitwise blend `(src[j+Shift] & m) | (src[j] & ~m)` then selects
-the source or destination without a branch. GCC recognizes this as vpblendvb.
-
-**Critical CMake fix**: `#pragma omp simd` requires `-fopenmp` to be passed to the host
-compiler. For CUDA source files, CMake does NOT automatically add `-Xcompiler -fopenmp`
-even when `OpenMP::OpenMP_CXX` is linked. The CMakeLists.txt must explicitly set:
-```cmake
-$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-mavx2,-fopenmp>
-```
-Without this, `#pragma omp simd` is treated as an unknown pragma and silently ignored,
-causing the loop to compile as fully scalar code (~250ms vs ~15ms measured).
-
-### 17d. Full shfl_down Compaction (9 Passes, Ping-Pong Buffers)
-
-The 9 passes alternate between two buffers so each call has fully separate __restrict__
-source and destination pointers. buf0 is initialized with the identity; after 9 passes
-(odd count), the result is in buf1. See §17g for the current form of the preamble that
-builds `shifts[]`.
-```cpp
-uint16_t buf0[512], buf1[512];
-for (int i = 0; i < 512; i++) buf0[i] = static_cast<uint16_t>(i);
-
-shflDownSep< 1>(buf0, shifts, buf1);
-shflDownSep< 2>(buf1, shifts, buf0);
-shflDownSep< 4>(buf0, shifts, buf1);
-shflDownSep< 8>(buf1, shifts, buf0);
-shflDownSep< 16>(buf0, shifts, buf1);
-shflDownSep< 32>(buf1, shifts, buf0);
-shflDownSep< 64>(buf0, shifts, buf1);
-shflDownSep<128>(buf1, shifts, buf0);
-shflDownSep<256>(buf0, shifts, buf1);
-
-const uint16_t* leafLocalOffsets = buf1;
-```
-
-### 17e. Range Intersection and Output Fill
-
-The contiguous-copy approach (sections 3, 6, 7) replaces the per-voxel scatter. Two VBM
-invariants simplify the early-exit (see §17g):
-- No leaf has zero active voxels.
-- Active voxel ranges across leaves are contiguous and monotonically ordered.
-
-These guarantee that no leaf in the iteration range is entirely before the block, so the
-only guard needed is a `break` (not `continue`) when a leaf starts at or after the block end:
-
-```cpp
-if (leafFirstOffset >= blockFirstOffset + BlockWidth) break;
-```
-
-Range intersection and output:
-```cpp
-const uint64_t globalStart = std::max(leafFirstOffset, blockFirstOffset);
-const uint64_t globalEnd = std::min(leafFirstOffset + leafValueCount,
- blockFirstOffset + BlockWidth);
-const uint64_t jStart = globalStart - leafFirstOffset;
-const uint64_t pStart = globalStart - blockFirstOffset;
-const uint64_t count = globalEnd - globalStart;
-
-std::fill(leafIndex + pStart, leafIndex + pStart + count, (uint32_t)leafID);
-std::copy(leafLocalOffsets + jStart, leafLocalOffsets + jStart + count, voxelOffset + pStart);
-```
-
-### 17f. Performance History
-
-**Performance (2M voxels / 16384 blocks / 25% occupancy / 24 OMP threads / AVX2):**
-- Original `getValue()` loop: ~77 ms
-- `buildMaskPrefixSums` + bit-scan scatter: ~65 ms
-- shfl_down without vectorization (in-place, no __restrict__): ~250 ms
-- shfl_down with proper vectorization (two-buffer __restrict__ + -fopenmp): ~15-20 ms
-- After §17g refactor (buildMaskPrefixSums, no prefixSums[]): ~14-20 ms (no regression)
-
-**Key lessons:**
-1. The in-place single-buffer form does NOT vectorize; two buffers + __restrict__ required.
-2. `#pragma omp simd` requires -Xcompiler=-fopenmp in the CUDA host compile flags.
-3. The arithmetic mask form (-(cond != 0)) is needed to avoid branch-vs-blend ambiguity.
-
-### 17g. Elimination of prefixSums[] via buildMaskPrefixSums
-
-`shifts[i]` = exclusive count of 0-bits (inactive voxels) at positions 0..i-1 is exactly
-what `buildMaskPrefixSums` produces when run over the bitwise complement of the mask.
-Adding a `template <bool Invert>` parameter to `buildMaskPrefixSums` allows
-writing `shifts[]` directly, eliminating the `prefixSums[513]` array and the explicit
-`shifts[i] = i - prefixSums[i]` subtraction loop:
-
-```cpp
-uint16_t shifts[513];
-shifts[0] = 0;
-util::buildMaskPrefixSums<true>(leaf.valueMask(), leaf.data()->mPrefixSum, shifts + 1);
-
-const uint16_t leafValueCount = static_cast<uint16_t>(512u) - shifts[512];
-```
-
-Result: `shifts[i]` = exclusive 0-bit prefix at i for i=0..511 (used by `util::shuffleDownMask`).
-`shifts[512]` = total inactive count; `leafValueCount` falls out as `512 - shifts[512]`.
-
-**How `buildMaskPrefixSums<true>` works**: two changes from the default (`Invert=false`):
-1. Step 1 (indicator fill): inverts the mask word before transposing: `~maskWords[x]`.
-2. Step 5 (cross-word offsets): for word x, the exclusive 0-bit count equals
- `64*x - ones`, where `ones` is the exclusive 1-bit count decoded from `mPrefixSum`.
The original (non-inverted) `mPrefixSum` field is passed unchanged by the caller.
-
-**Stack savings**: eliminates 1026 bytes (`prefixSums[513]`) and one 512-iteration pass.
-The `shifts[513]` array (1026 bytes) replaces the old `shifts[512]` (1024 bytes) at
-negligible cost (+2 bytes) while removing the need for `prefixSums[]` entirely.
-
-### 17h. shflDown → util::shuffleDownMask; generalize and promote to utility
-
-The single-buffer `shflDown` (introduced in the §17d revision) is generalized and
-promoted to a free function `nanovdb::util::shuffleDownMask` in `VoxelBlockManager.h`
-(a candidate for a future `nanovdb/util/Algo.h`). The final signature:
-
-```cpp
-template <int N, int Shift, typename DataT, typename MaskT>
-inline void shuffleDownMask(DataT* NANOVDB_RESTRICT data,
- const MaskT* NANOVDB_RESTRICT masks,
- MaskT maskBits)
-{
- static_assert(Shift > 0 && Shift < N, "Shift must satisfy 0 < Shift < N");
- static_assert(std::is_unsigned_v<DataT>, "DataT must be an unsigned integer type");
- static_assert(std::is_unsigned_v<MaskT>, "MaskT must be an unsigned integer type");
- #pragma omp simd
- for (int j = 0; j < N - Shift; j++) {
- const DataT m = (masks[j + Shift] & maskBits) != 0 ? ~DataT{0} : DataT{0};
- data[j] = (data[j + Shift] & m) | (data[j] & ~m);
- }
-}
-```
-
-**Changes from the §17d single-buffer form**:
-- `shifts` → `masks` (parameter name); `& Shift` → `& maskBits` (runtime parameter, no
- default — caller is explicit). `maskBits` is always a literal at every call site so
- the compiler constant-folds it.
-- Generalized from `uint16_t` to `DataT` / `MaskT` (any unsigned integer types).
-- Generalized from hardcoded `512` to template parameter `N`.
-- Blend mask uses `~DataT{0}` / `DataT{0}` (all-ones / all-zeros of the correct width)
- instead of the `-static_cast<int>(bool)` trick — clearer and equally efficient.
-- `static_assert` guards on `Shift > 0 && Shift < N` and unsigned types.
-- Extracted from `VoxelBlockManager` private static → `nanovdb::util` free function.
-- `NANOVDB_RESTRICT` replaces `__restrict__` for portability (MSVC uses `__restrict`).
-- Renamed `shflDown` → `shuffleDownMask`: "shuffle down" follows the CUDA
- `__shfl_down_sync` convention; "Mask" denotes the predicate table parameter.
- The operation is a conditional fixed-distance gather from higher-indexed positions,
- not an arbitrary permutation — "shuffle" is preferred over "shift" because of the
- conditional, data-dependent nature of the movement.
-
-**Call site**:
-```cpp
-util::shuffleDownMask<512, 1>(buf, shifts, uint16_t{ 1});
-// ... through ...
-util::shuffleDownMask<512, 256>(buf, shifts, uint16_t{256});
-```
-No behavioral change from §17d.
-
-**Future consideration — header migration**: `shuffleDownMask` currently lives in
-`VoxelBlockManager.h` because that is where it was first needed. It is a generic,
-VBM-independent primitive with no dependency on any VBM type. A natural future home
-is a dedicated utility header such as `nanovdb/util/Algo.h` (does not yet exist) or
-`nanovdb/util/Util.h`. Migration should be deferred until at least one second call
-site emerges outside the VBM, to avoid premature abstraction.

From aae9b80fd71659d4af9aa336e933308290fe748d Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Thu, 2 Apr 2026 14:06:46 -0500
Subject: [PATCH 05/60] ex_voxelBlockManager_host_cuda: add StencilGather
 planning doc

Design reference for the per-block stencil gather kernel: decodes
inverse maps into block-local scratch, then resolves neighbor leaf
pointers and fills N-point stencil index arrays for all active voxels
in the block.
WENO5 (N=19, R=3) is the motivating instance; architecture is stencil-agnostic. Covers GPU inner loop, CPU SIMD batch design (SIMDw=16, probeLeaf dedup), unified StencilLeafPtrs template, and reach-R generalization considerations. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../StencilGather.md | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md new file mode 100644 index 0000000000..585ce4d6c4 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md @@ -0,0 +1,212 @@ +# Per-Block Stencil Gather + +This document is the design and planning reference for the per-block stencil gather +kernel — the operation that, given a built VBM and a block ID, computes the neighbor +index sets for all active voxels in that block under a given stencil shape. It is +written as dense, agent-consumable facts and design decisions. + +The WENO5 19-point stencil (±3 along each axis independently) is the motivating +instantiation, but the architecture is stencil-agnostic. The stencil shape enters +only as a compile-time parameter governing the number of output slots and the +neighbor leaf resolution logic. + +--- + +## 1. Scope and Place in the Architecture + +The stencil gather sits at the **second level** of the two-level VBM parallelism +hierarchy: + +| Level | Operation | Parallelism | +|-------|-----------|-------------| +| System | `buildVoxelBlockManager` | Threading (TBB/CUDA grid) over blocks | +| Per-block | `decodeInverseMaps` + stencil gather | SIMD/SIMT within one block | + +The stencil gather: +- **Assumes** the VBM has already been built (`firstLeafID[]`, `jumpMap[]` populated). +- **Is called** once per voxel block, from one CPU thread or one CUDA CTA. +- **Uses** no inter-block communication and holds no state beyond its call. +- **Is not** responsible for launching threads or distributing work across blocks. + That is the caller's responsibility (a future launcher, analogous to + `buildVoxelBlockManager`). + +--- + +## 2. Per-Block Execution Model + +Within one call (one CPU thread / one CUDA CTA): + +1. **Decode inverse maps** into block-local storage: + - GPU: `smem_leafIndex[BlockWidth]` / `smem_voxelOffset[BlockWidth]` in shared + memory, filled cooperatively by the CTA via `decodeInverseMaps`. + - CPU: `leafIndex[BlockWidth]` / `voxelOffset[BlockWidth]` on the stack + (cache-resident), filled by a single call to `decodeInverseMaps`. +2. **Loop over active voxels** in the block (positions where + `leafIndex[p] != UnusedLeafIndex`). +3. **For each active voxel**: resolve the neighbor leaf pointers for the stencil + shape, then fill the stencil's N-entry index array. + +**Key invariant on intermediate storage**: `leafIndex` and `voxelOffset` are scratch +only. They do not persist beyond this per-block call, and neither do any intermediate +neighbor-leaf-pointer structures. The stencil index arrays are the outputs. + +--- + +## 3. Stencil Parameterization + +A stencil is characterized by: + +- **N**: number of points (including center). +- **Point set**: compile-time set of relative (Δx, Δy, Δz) offsets, with a defined + mapping from each offset to an index in [0, N). +- **Reach R**: max |Δ| along any axis. 
Governs how many distinct neighbor leaves
+  per axis must be resolved (see §4).
+
+For the WENO5 stencil: N=19, reach R=3, point set = {0} ∪ {±1,±2,±3 along each
+axis independently}, index mapping = `WenoPt::idx`.
+
+The index mapping convention is stencil-specific and must be documented per stencil.
+In particular, `WenoPt::idx` (NanoVDB) is inconsistent with `NineteenPt::idx`
+(OpenVDB) and must not be cross-used.
+
+---
+
+## 4. Neighbor Leaf Resolution
+
+### 4a. How Many Leaf Neighbors Per Axis
+
+A leaf covers 8 positions along each axis. For a stencil with reach R, a voxel at
+leaf-local position p along one axis needs neighbors at p-R .. p+R. The number of
+distinct leaves touched along that axis depends on where p falls within the leaf:
+
+- For R ≤ 3 (e.g. WENO5): at most **one** neighbor leaf per axis (either lo or hi,
+  never both simultaneously, for any p in [0,7]). This is because the worst case
+  (p=0, reach=3) reaches p-3 = -3 (one leaf back) but p+3 = 3 (still in the same
+  leaf).
+- For R > 4: a center voxel near the middle of a leaf can require neighbors in both
+  the lo and the hi neighboring leaf along the same axis simultaneously.
+
+The current `resolveLeafPtrs` design (`ptrs[axis][0..2]`: lo/center/hi) is correct
+for R ≤ 3. A more general design would use `ptrs[axis][0..K]` where K = number of
+neighbor leaves per axis.
+
+### 4b. resolveLeafPtrs — Design
+
+```
+resolveLeafPtrs(grid, leaf, voxelOffset) → StencilLeafPtrs
+```
+
+- Performs the minimum number of `probeLeaf` calls required by the stencil shape.
+- For WENO5 (R=3): exactly **3 probeLeaf calls total** (one per axis), since at most
+  one neighbor leaf is needed per axis.
+- Returns a `StencilLeafPtrs` struct whose layout is stencil-specific (see §5).
+- Intentionally scalar: `probeLeaf` is pointer-chasing and not vectorizable.
+
+### 4c. computeStencil — Design
+
+```
+computeStencil(leaf, voxelOffset, leafPtrs, data[N])
+```
+
+- Fills `data[N]` with global sequential indices for all N stencil points.
+- Caller must zero-initialize `data[]`; entries for out-of-narrow-band neighbors
+  remain 0.
+- Uses the stencil's index mapping (e.g. `WenoPt::idx`) throughout —
+  never hardcoded integers.
+- This is the auto-vectorization target for the CPU port (see §6).
+
+---
+
+## 5. StencilLeafPtrs Struct
+
+Unified template parameterized on build type and leaf pointer type, enabling both
+scalar (GPU) and batched (CPU) instantiations from one definition:
+
+```cpp
+template <typename BuildT, typename LeafPtrT>
+struct StencilLeafPtrs {
+    LeafPtrT ptrs[3][3];   // [axis][slot]: slot 0=lo, 1=center, 2=hi
+};
+```
+
+- **GPU** (scalar per thread): `LeafPtrT = const NanoLeaf<BuildT>*`
+- **CPU batch** (SIMDw lanes): `LeafPtrT = std::array<const NanoLeaf<BuildT>*, SIMDw>`
+
+The `ptrs[3][3]` shape is correct for stencils with R ≤ 3. Larger stencils would
+require a different slot count.
+
+The current GPU draft in `VoxelBlockManager.cuh` uses `WenoLeafPtrs<BuildT>`
+(GPU-only, WENO5-specific, parameterized only on build type). Generalizing to
+`StencilLeafPtrs<BuildT, LeafPtrT>` is a prerequisite for the CPU implementation
+and for supporting additional stencil shapes.
+
+---
+
+## 6. 
GPU Inner Loop (Current Draft) + +After `decodeInverseMaps`, each thread with `smem_leafIndex[tID] != UnusedLeafIndex`: + +```cpp +const auto& leaf = tree.getFirstNode<0>()[smem_leafIndex[tID]]; +const uint16_t vo = smem_voxelOffset[tID]; + +uint64_t stencilData[N] = {}; +auto leafPtrs = VBM::resolveLeafPtrs(grid, leaf, vo); +VBM::computeStencil(leaf, vo, leafPtrs, stencilData); +``` + +No synchronization needed between decode and stencil steps beyond the `__syncthreads()` +already inside `decodeInverseMaps`. `resolveLeafPtrs` and `computeStencil` are both +per-thread and divergence-safe. + +--- + +## 7. CPU Inner Loop + +### 7a. SIMD Batch Width + +Process voxels in batches of `SIMDw = 16`. With AVX2 (16 × uint16_t per register), +each batch maps to one SIMD register width for `voxelOffset`. + +### 7b. probeLeaf Deduplication + +Within a batch of SIMDw=16 voxels, the neighbor coordinate along each axis (rounded +to leaf granularity) takes at most **2 distinct values** per axis. The result of each +`probeLeaf` call is broadcast to the lanes sharing that neighbor coordinate. + +For a stencil with R ≤ 3: ≤ 2 `probeLeaf` calls per axis × 3 axes = +**≤ 6 `probeLeaf` calls per batch** (vs up to 3×SIMDw for naive per-voxel approach). + +The deduplication bound depends on both SIMDw and leaf size (8). For larger SIMDw +or larger R, more distinct neighbor coordinates can appear per batch. + +### 7c. computeStencil Vectorization + +The outer loop over lanes (i = 0 .. SIMDw-1) calls `computeStencil` once per lane +with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strategy: + +- `[[clang::always_inline]]` on `computeStencil`. +- `__restrict__` on output pointers. +- `#pragma clang loop vectorize(enable) vectorize_width(16)` on the outer lane loop. +- Output via `std::array` (proven to vectorize; POD struct output + vectorizes the wrong dimension). + +--- + +## 8. Open Questions / Deferred Decisions + +- **Launcher design**: the system-level wrapper that dispatches per-block calls + (the `buildVoxelBlockManager` analogue for the stencil gather). Deferred until + the per-block kernel is validated. + +- **Index → value conversion**: `stencilData[N]` currently holds global sequential + indices. The PDE consumer wants `float` values. Whether the index-to-value lookup + (`grid->tree().getValue(idx)`) happens inside or outside this kernel is TBD. + +- **CPU `resolveLeafPtrs` batch function**: the per-batch deduplication logic (§7b) + needs its own function, separate from the GPU scalar `resolveLeafPtrs`. Signature + and deduplication algorithm TBD. + +- **Generalizing beyond R ≤ 3**: the `ptrs[3][3]` struct and single-neighbor-per-axis + assumption are baked into the current design. Any stencil with R > 4 would require + revisiting §4a and §5. From da91919248bd472a627e5435d390aac02727b0c5 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 2 Apr 2026 14:28:06 -0500 Subject: [PATCH 06/60] StencilGather: add stencil type interface and kernel/output design MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - §3: Stencil type as template parameter needs index→offsets direction (for-each-slot gather loop), not the offsets→index direction of WenoPt. Clarify relationship to BaseStencil/WenoStencil: geometry-only descriptor, no accessor coupling. - §4: Kernel lambda signature std::array kernel(const ValueType* u); output is homogeneous std::array (not tuple); K=1 degenerates to scalar; SoA output layout results[k][BlockWidth] for SIMD efficiency. 
- Renumber §4-§8 → §5-§9; update open questions accordingly. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../StencilGather.md | 180 ++++++++++++++---- 1 file changed, 142 insertions(+), 38 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md index 585ce4d6c4..467900b86c 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md @@ -1,14 +1,16 @@ # Per-Block Stencil Gather This document is the design and planning reference for the per-block stencil gather -kernel — the operation that, given a built VBM and a block ID, computes the neighbor -index sets for all active voxels in that block under a given stencil shape. It is -written as dense, agent-consumable facts and design decisions. +kernel — the operation that, given a built VBM, a block ID, and a user-supplied kernel +lambda, gathers stencil neighbor values for all active voxels in the block and +produces a per-voxel output array. It is written as dense, agent-consumable facts +and design decisions. The WENO5 19-point stencil (±3 along each axis independently) is the motivating instantiation, but the architecture is stencil-agnostic. The stencil shape enters -only as a compile-time parameter governing the number of output slots and the -neighbor leaf resolution logic. +as a compile-time template parameter governing the number of neighbor slots, the +neighbor leaf resolution logic, and the value fetch. The user supplies a scalar +kernel lambda that operates on the gathered values and produces the output. --- @@ -43,37 +45,133 @@ Within one call (one CPU thread / one CUDA CTA): (cache-resident), filled by a single call to `decodeInverseMaps`. 2. **Loop over active voxels** in the block (positions where `leafIndex[p] != UnusedLeafIndex`). -3. **For each active voxel**: resolve the neighbor leaf pointers for the stencil - shape, then fill the stencil's N-entry index array. +3. **For each active voxel**: resolve the neighbor leaf pointers, fetch the N + neighbor values into a local array, invoke the kernel lambda, and write the + output. -**Key invariant on intermediate storage**: `leafIndex` and `voxelOffset` are scratch -only. They do not persist beyond this per-block call, and neither do any intermediate -neighbor-leaf-pointer structures. The stencil index arrays are the outputs. +**Key invariant on intermediate storage**: `leafIndex`, `voxelOffset`, the +neighbor-leaf-pointer structs, and the per-voxel value arrays are all scratch only. +They do not persist beyond this per-block call. The kernel output array is the +only output. --- -## 3. Stencil Parameterization +## 3. Stencil Type as Template Parameter -A stencil is characterized by: +### 3a. What the Infrastructure Needs -- **N**: number of points (including center). -- **Point set**: compile-time set of relative (Δx, Δy, Δz) offsets, with a defined - mapping from each offset to an index in [0, N). -- **Reach R**: max |Δ| along any axis. Governs how many distinct neighbor leaves - per axis must be resolved (see §4). +The gather infrastructure iterates over stencil slots `n = 0 .. N-1` and for each +needs to know the Cartesian offset `(Δx, Δy, Δz)` to look up. The pipeline is: -For the WENO5 stencil: N=19, reach R=3, point set = {0} ∪ {±1,±2,±3 along each -axis independently}, index mapping = `WenoPt::idx`. 
+
+```
+for n in 0..N-1:
+    values[n] = grid.getValue(center + StencilT::offset(n))
+```
+
+This requires **index → offsets** direction: given slot index n, return `(Δx, Δy, Δz)`.
+
+The existing `WenoPt::idx` (NanoVDB) and `NineteenPt::idx` (OpenVDB)
+go in the **opposite** direction (offsets → index) and are primarily useful to the
+user writing the kernel lambda (addressing a specific neighbor by name). They are
+not directly usable by the infrastructure's gather loop.
+
+The stencil type must therefore expose a compile-time offset table:
+
+```cpp
+// For each slot n in [0, N), the Cartesian offset
+static constexpr std::array<std::array<int, 3>, N> offsets;
+// or equivalently a static constexpr accessor:
+static constexpr std::array<int, 3> offset(int n);
+```
+
+### 3b. Relationship to BaseStencil / WenoStencil
+
+`nanovdb::math::BaseStencil` and `WenoStencil` couple the
+stencil geometry to a grid accessor (`mAcc`) via `init()` / `moveTo()`. This coupling
+is incompatible with the VBM batch gather, where the infrastructure owns the value
+lookup.
+
+What is reusable from the existing design:
+- `SIZE` / `static constexpr int SIZE` — directly useful.
+- `WenoPt::idx` / `pos()` — useful to the *user's kernel lambda*
+  for addressing neighbors by name, but not to the gather loop itself.
+
+The stencil type for our template parameter is a **geometry-only descriptor** — no
+accessor, no stored values. It could be a thin wrapper around the existing types,
+or a new family of types alongside them.
+
+### 3c. Stencil Characteristics
+
+- **N** (`SIZE`): number of points including center.
+- **Offset table**: compile-time mapping from slot index → `(Δx, Δy, Δz)`.
+- **Reach R**: `max |Δ|` over all axes and all slots. Governs neighbor leaf
+  resolution (see §5).
+
+For WENO5: N=19, R=3, offsets derived from `WenoPt` specializations.
+
+---
+
+## 4. Kernel Lambda and Output Type
+
+### 4a. Kernel Lambda Signature
+
+The user supplies a kernel lambda with signature:
+
+```cpp
+std::array<ValueType, K> kernel(const ValueType* u);
+```
+
+where `u[n]` is the grid value at stencil slot `n` (i.e. `u[0]` is the center,
+`u[WenoPt<1,0,0>::idx]` is the +x neighbor for WENO5, etc.). The lambda is
+completely unaware of indices, leaf pointers, or SIMD lanes.
+
+Example — Laplacian (K=1):
+```cpp
+auto laplacian = [](const float* u) -> std::array<float, 1> {
+    return { -6.f*u[0] + u[GradPt<1,0,0>::idx] + u[GradPt<-1,0,0>::idx]
+                       + u[GradPt<0,1,0>::idx] + u[GradPt<0,-1,0>::idx]
+                       + u[GradPt<0,0,1>::idx] + u[GradPt<0,0,-1>::idx] };
+};
+```
+
+Example — gradient (K=3):
+```cpp
+auto grad = [](const float* u) -> std::array<float, 3> {
+    return { 0.5f*(u[GradPt<1,0,0>::idx] - u[GradPt<-1,0,0>::idx]),
+             0.5f*(u[GradPt<0,1,0>::idx] - u[GradPt<0,-1,0>::idx]),
+             0.5f*(u[GradPt<0,0,1>::idx] - u[GradPt<0,0,-1>::idx]) };
+};
+```
+
+### 4b. Output Type: std::array<ValueType, K>
+
+The output is always `std::array<ValueType, K>` — homogeneous in type. K=1
+degenerates naturally to the scalar case without special-casing.
+
+Heterogeneous output (e.g. `std::tuple`) is not needed for the typical PDE/level-set
+workload: Laplacian (K=1), gradient (K=3), WENO upwind differences (K=6), curvature
+components (K=2) are all uniform in type. A tuple would also defeat auto-vectorization.
+
+### 4c. Output Buffer Layout
+
+The per-block output is stored in SoA layout:
+
+```
+results[k][BlockWidth] for k = 0 .. 
K-1 +``` + +Each channel `k` is a contiguous array of `ValueType` across all BlockWidth voxel +positions, mapping cleanly to K independent SIMD registers. AoS layout +(`results[BlockWidth][K]`) would interleave channels and defeat SIMD. + +K is either deduced from the lambda's return type or supplied as an explicit template +parameter. --- -## 4. Neighbor Leaf Resolution +## 5. Neighbor Leaf Resolution -### 4a. How Many Leaf Neighbors Per Axis +### 5a. How Many Leaf Neighbors Per Axis A leaf covers 8 positions along each axis. For a stencil with reach R, a voxel at leaf-local position p along one axis needs neighbors at p-R .. p+R. The number of @@ -90,7 +188,7 @@ The current `resolveLeafPtrs` design (`ptrs[axis][0..2]`: lo/center/hi) is corre for R ≤ 3. A more general design would use `ptrs[axis][0..K]` where K = number of neighbor leaves per axis. -### 4b. resolveLeafPtrs — Design +### 5b. resolveLeafPtrs — Design ``` resolveLeafPtrs(grid, leaf, voxelOffset) → StencilLeafPtrs @@ -102,7 +200,7 @@ resolveLeafPtrs(grid, leaf, voxelOffset) → StencilLeafPtrs - Returns a `StencilLeafPtrs` struct whose layout is stencil-specific (see §5). - Intentionally scalar: `probeLeaf` is pointer-chasing and not vectorizable. -### 4c. computeStencil — Design +### 5c. computeStencil — Design ``` computeStencil(leaf, voxelOffset, leafPtrs, data[N]) @@ -117,7 +215,7 @@ computeStencil(leaf, voxelOffset, leafPtrs, data[N]) --- -## 5. StencilLeafPtrs Struct +## 6. StencilLeafPtrs Struct Unified template parameterized on build type and leaf pointer type, enabling both scalar (GPU) and batched (CPU) instantiations from one definition: @@ -142,7 +240,7 @@ and for supporting additional stencil shapes. --- -## 6. GPU Inner Loop (Current Draft) +## 7. GPU Inner Loop (Current Draft) After `decodeInverseMaps`, each thread with `smem_leafIndex[tID] != UnusedLeafIndex`: @@ -161,14 +259,14 @@ per-thread and divergence-safe. --- -## 7. CPU Inner Loop +## 8. CPU Inner Loop -### 7a. SIMD Batch Width +### 8a. SIMD Batch Width Process voxels in batches of `SIMDw = 16`. With AVX2 (16 × uint16_t per register), each batch maps to one SIMD register width for `voxelOffset`. -### 7b. probeLeaf Deduplication +### 8b. probeLeaf Deduplication Within a batch of SIMDw=16 voxels, the neighbor coordinate along each axis (rounded to leaf granularity) takes at most **2 distinct values** per axis. The result of each @@ -180,7 +278,7 @@ For a stencil with R ≤ 3: ≤ 2 `probeLeaf` calls per axis × 3 axes = The deduplication bound depends on both SIMDw and leaf size (8). For larger SIMDw or larger R, more distinct neighbor coordinates can appear per batch. -### 7c. computeStencil Vectorization +### 8c. computeStencil Vectorization The outer loop over lanes (i = 0 .. SIMDw-1) calls `computeStencil` once per lane with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strategy: @@ -193,20 +291,26 @@ with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strate --- -## 8. Open Questions / Deferred Decisions +## 9. Open Questions / Deferred Decisions - **Launcher design**: the system-level wrapper that dispatches per-block calls (the `buildVoxelBlockManager` analogue for the stencil gather). Deferred until the per-block kernel is validated. -- **Index → value conversion**: `stencilData[N]` currently holds global sequential - indices. The PDE consumer wants `float` values. Whether the index-to-value lookup - (`grid->tree().getValue(idx)`) happens inside or outside this kernel is TBD. 
+- **Stencil type definition**: the geometry-only stencil descriptor (§3) needs a + concrete C++ form — whether a new family of types, a thin wrapper around existing + `BaseStencil` specializations, or a standalone `constexpr` struct. The offset table + representation (`std::array, N>` vs a static `constexpr` accessor + function) is also TBD. + +- **K deduction vs explicit parameter**: whether K (output count) is deduced from the + lambda's return type via `decltype` / CTAD, or supplied as an explicit template + parameter alongside the stencil type. -- **CPU `resolveLeafPtrs` batch function**: the per-batch deduplication logic (§7b) +- **CPU `resolveLeafPtrs` batch function**: the per-batch deduplication logic (§8b) needs its own function, separate from the GPU scalar `resolveLeafPtrs`. Signature and deduplication algorithm TBD. - **Generalizing beyond R ≤ 3**: the `ptrs[3][3]` struct and single-neighbor-per-axis assumption are baked into the current design. Any stencil with R > 4 would require - revisiting §4a and §5. + revisiting §5a and §6. From 57dec68315a803716bee5000fd8cfd52e72cf028 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 2 Apr 2026 18:25:50 -0500 Subject: [PATCH 07/60] simd_test: add liftToSimd vectorization experiment and investigation notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a self-contained test (lift_test.cpp) exploring a generic SIMD-lifting abstraction: given a scalar tuple→tuple kernel, liftToSimd produces an SoA-wide version that loops over W lanes and is the auto-vectorization target. The motivating kernel is WENO5 normSqGrad (19-point stencil, matching WenoStencil::normSqGrad from Stencils.h). The six weno5() calls vectorize cleanly; godunovsNormSqrd() blocks vectorization in two distinct ways depending on how it is written: 1. std::max / bool isOutside ternaries → "control flow in loop" 2. float sign + fmaxf (no ternaries) → "no vectype for stmt" due to GCC's inability to see through std::tuple's recursive-inheritance struct layout in GIMPLE alias analysis INVESTIGATION.md documents all experiments, findings, current blockers, and proposed next steps (pointer-cache approach, Clang comparison, etc.). Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Evangelos Sifakis Signed-off-by: Efty Sifakis --- simd_test/INVESTIGATION.md | 245 +++++++++++++++++++++++++++++++++++++ simd_test/lift_test.cpp | 218 +++++++++++++++++++++++++++++++++ 2 files changed, 463 insertions(+) create mode 100644 simd_test/INVESTIGATION.md create mode 100644 simd_test/lift_test.cpp diff --git a/simd_test/INVESTIGATION.md b/simd_test/INVESTIGATION.md new file mode 100644 index 0000000000..a59f3daa59 --- /dev/null +++ b/simd_test/INVESTIGATION.md @@ -0,0 +1,245 @@ +# liftToSimd Vectorization Investigation + +This document captures the design idea, experiments, findings, and open questions +from an in-progress investigation into auto-vectorizing a scalar stencil kernel +through a generic SIMD lifting abstraction. It is written as a reference for +resuming the investigation in a future session. + +--- + +## 1. Motivation + +The VoxelBlockManager CPU port (branch `vbm-cpu-port`) processes voxels in batches +of `SIMDw = 16` (one AVX2 register width of uint16_t). For each batch the same +stencil computation is applied to every lane. The goal is to write the stencil +physics **once** as a scalar, `__hostdev__`-compatible lambda (usable unmodified on +the GPU), and automatically derive an auto-vectorized CPU batch kernel from it. 
+ +--- + +## 2. The `liftToSimd` Pattern + +### 2a. Core Idea + +A scalar kernel has signature: + +```cpp +ScalarTupleOut kernel(ScalarTupleIn); +``` + +where `ScalarTupleIn = std::tuple` and `ScalarTupleOut = std::tuple`. + +The SIMD version replaces every `T` in the tuple with `std::array`, giving an +SoA (struct-of-arrays) layout: + +```cpp +SimdTupleIn = std::tuple, std::array, ..., std::array> +SimdTupleOut = std::tuple, ...> +``` + +`liftToSimd(kernel)` returns a lambda that loops over lanes 0..W-1, extracts the +i-th element from each input array (forming a `ScalarTupleIn`), calls `kernel`, and +writes the result back into the i-th slot of each output array. The loop is the +auto-vectorization target. + +### 2b. Infrastructure + +```cpp +// ToSimdTuple, W>::type = tuple...> +template struct ToSimdTuple; + +// extractSlice: return tuple of the i-th elements from a tuple-of-arrays +template +auto extractSlice(const SimdTupleT& t, int i, std::index_sequence); + +// storeSlice: write a scalar tuple into the i-th slot of a SIMD tuple +template +void storeSlice(SimdTupleT& t, int i, const ScalarTupleT& s, std::index_sequence); + +template +auto liftToSimd(ScalarFn f) { + return [f](const auto& simdIn, auto& simdOut) { + constexpr auto inSize = std::tuple_size_v>; + constexpr auto outSize = std::tuple_size_v>; + for (int i = 0; i < W; i++) { + auto scalarIn = extractSlice(simdIn, i, std::make_index_sequence{}); + auto scalarOut = f(scalarIn); + storeSlice(simdOut, i, scalarOut, std::make_index_sequence{}); + } + }; +} +``` + +### 2c. Key Requirement: `__attribute__((noinline))` Wrapper + +The vectorization loop must live inside a `__attribute__((noinline))` function. +Without this, GCC constant-folds the entire computation (because the test's input +data is compile-time computable) and emits no packed instructions at all, making it +appear that vectorization failed when it actually never ran. + +--- + +## 3. Kernel Under Test: WENO5 `normSqGrad` + +The scalar kernel computes the Godunov upwind norm-squared gradient using WENO5 +differences, matching `WenoStencil::normSqGrad` from `nanovdb/math/Stencils.h`. + +**Inputs**: 19 floats `v0..v18` representing the center voxel and ±3 neighbors along +each axis (same layout as `WenoPt::idx`). + +**Computation**: +1. Six WENO5 calls → six upwind differences `dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp` +2. `godunovsNormSqrd(v0 > isoValue, dP_xm, ..., dP_zp)` → Godunov norm-squared +3. Scale by `invDx2` + +**Type aliases**: +```cpp +using WenoIn = std::tuple; // 19 floats +using WenoOut = std::tuple; +constexpr int W = 16; +using WenoSimdIn = ToSimdTuple::type; +using WenoSimdOut = ToSimdTuple::type; +``` + +--- + +## 4. Vectorization Experiments and Findings + +Compiled with: `g++ -O3 -march=native -std=c++17 -fopt-info-vec-missed` +Platform: x86-64, GCC 13, AVX2. + +### Experiment 1 — Simple Laplacian (baseline) + +Kernel: `v0 - 6*v1 + v2 + ...` (7-point stencil, pure arithmetic). +**Result: VECTORIZES.** Emits `ymm`-width `vfmadd*ps`, `vaddps`. + +### Experiment 2 — Six WENO5 calls, sum all six dP terms + +Kernel computes `dP_xm + dP_xp + ... + dP_zp` (no `godunovsNormSqrd`). +Each `weno5` call is ~12 fmas and 2 divisions — complex but purely arithmetic. +**Result: VECTORIZES.** Confirms the WENO5 computation itself is not the blocker. + +### Experiment 3 — Full `normSqGrad` with `v0 > isoValue` + +Adds `godunovsNormSqrd(v0 > isoValue, ...)` to the pipeline. 
GCC reports: `not vectorized: control flow in loop.` (loop line 41)
**Result: DOES NOT VECTORIZE.**

### Experiment 4 — Constant `true` instead of `v0 > isoValue`

Replace `v0 > isoValue` with compile-time `true` to rule out the comparison as the
blocker.
GCC still reports: `not vectorized: control flow in loop.`
**Result: DOES NOT VECTORIZE.**
Conclusion: the issue is inside `godunovsNormSqrd`, not at the call site.

### Experiment 5 — Replace `bool isOutside` with `float sign`, use `fmaxf`

Reformulated `godunovsNormSqrd` to take `float sign` (+1.f/-1.f) and use `fmaxf`
instead of `std::max` and ternary operators:
```cpp
float xm = fmaxf( sign * dP_xm, 0.f); xm *= xm;
```
GCC now reports a different error for the same loop:
```
not vectorized: no vectype for stmt:
  MEM <float> [(const float &)simdIn_5(D)]._M_elems[_67]
  scalar_type: const float
```
**Result: DOES NOT VECTORIZE — but for a different reason.**
The `control flow` blocker is gone. A new blocker appears: GCC's vectorizer cannot
find a vector type for the struct member access through `std::tuple`'s
implementation-detail inheritance chain (`_Tuple_impl`, `_Head_base`). The
`(const float &)simdIn_5(D)` in the GIMPLE indicates the vectorizer is seeing the
parameter reference cast through the tuple internals and cannot determine the memory
access is stride-1.

---

## 5. Current Blockers (in priority order)

### Blocker A: `std::tuple` struct indirection

`std::get<k>(simdIn)[i]` for fixed k, varying i, is a stride-1 access into one of
the 19 contiguous `std::array<float, W>` members of the tuple. GCC's vectorizer
fails to prove this because `std::tuple` in libstdc++ uses recursive inheritance
(`_Tuple_impl<I, Head, Tail...> : _Tuple_impl<I+1, Tail...>, _Head_base<I, Head>`),
and the GIMPLE representation of member access through that chain is too opaque for
the vectorizer's alias analysis.

**Hypothesis**: caching `.data()` pointers for each tuple element outside the hot
loop — so the loop only sees `inPtrs[k][i]` (simple indirect load) — may allow the
vectorizer to prove stride-1 access. This would require reworking `extractSlice`
and `storeSlice` to operate on pointer arrays rather than going through `std::get`.

### Blocker B: `std::max` / ternary in `godunovsNormSqrd`

Even before reaching the struct-access issue, the ternary-based `std::max(a, b)` in
`godunovsNormSqrd` generates control flow IR that blocks vectorization. Using
`fmaxf` (which maps to a hardware `maxss`/`maxps` instruction) removes this blocker.
The current file keeps `std::max` / `bool isOutside` for readability and correctness;
any vectorization-capable reformulation will need `fmaxf` or equivalent.

---

## 6. Proposed Next Steps

### Step 1 — Pointer-cache approach in `liftToSimd`

Before the hot loop, extract pointers to each tuple element's underlying array:

```cpp
// Before loop:
using ElemT = float;   // known for homogeneous tuples
const ElemT* inPtrs[inSize];
// applied over an index_sequence: inPtrs[Is] = std::get<Is>(simdIn).data();

// In loop body:
// access: inPtrs[k][i] — provably stride-1 for fixed k, varying i
```

Re-run `fopt-info-vec-missed` to see if the struct-access blocker disappears.

### Step 2 — Combine with `fmaxf` in `godunovsNormSqrd`

Once the struct-access blocker is cleared, reinstate the `fmaxf` / `float sign`
formulation and check whether the full `normSqGrad` kernel vectorizes.
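
A sketch of what that reinstated formulation could look like (hypothetical helper
name; the checked-in `lift_test.cpp` deliberately keeps the `std::max` / `bool
isOutside` form discussed under Blocker B):

```cpp
// Sketch only: branch-free Godunov term, with sign = +1.f outside (v0 > isoValue)
// and -1.f inside. fmaxf lowers to maxss/maxps, so the loop body has no control flow.
inline float godunovsNormSqrdSign(float sign,
                                  float dP_xm, float dP_xp,
                                  float dP_ym, float dP_yp,
                                  float dP_zm, float dP_zp)
{
    float xm = fmaxf( sign * dP_xm, 0.f); xm *= xm;
    float xp = fmaxf(-sign * dP_xp, 0.f); xp *= xp;
    float ym = fmaxf( sign * dP_ym, 0.f); ym *= ym;
    float yp = fmaxf(-sign * dP_yp, 0.f); yp *= yp;
    float zm = fmaxf( sign * dP_zm, 0.f); zm *= zm;
    float zp = fmaxf(-sign * dP_zp, 0.f); zp *= zp;
    return fmaxf(xm, xp) + fmaxf(ym, yp) + fmaxf(zm, zp);
}
```

For `sign = +1.f` this reproduces the outside branch of the ternary version, and for
`sign = -1.f` the inside branch, so the two formulations agree lane-for-lane.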
+ +### Step 3 — Clang comparison + +Compile the same test with `clang++ -O3 -march=native -Rpass=loop-vectorize` to +determine whether this is a GCC-specific limitation or a fundamental IR issue. +Clang's vectorizer handles struct member accesses differently and may succeed where +GCC fails. + +### Step 4 — Restore `v0 > isoValue` + +Once the constant-sign version vectorizes, replace `1.f` with +`(v0 > isoValue) ? 1.f : -1.f` at the call site. This introduces a VCMPPS+BLENDVPS +at the call site but no branching inside the arithmetic, which the vectorizer should +handle as a blend. + +### Step 5 — Consider alternative abstraction: `const ValueType*` kernel + +`StencilGather.md §4a` already specifies the kernel lambda signature as +`std::array(const ValueType* u)` (raw pointer, not tuple). +If the tuple path proves too resistant to auto-vectorization, the SIMD lift can be +reformulated over flat `float[N][W]` SoA arrays instead. The `liftToSimd` idea +survives — the tuple input/output types would be replaced by flat arrays — but the +scalar lambda signature changes slightly. + +--- + +## 7. File Reference + +| File | Purpose | +|------|---------| +| `simd_test/lift_test.cpp` | Self-contained test: `liftToSimd` infrastructure + WENO5 normSqGrad kernel | +| `nanovdb/nanovdb/math/Stencils.h` | Original `weno5`, `GodunovsNormSqrd`, `WenoStencil::normSqGrad` | +| `nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md` | Per-block stencil gather design doc (kernel lambda spec, CPU batch strategy) | +| `nanovdb/nanovdb/tools/VoxelBlockManager.h` | CPU VBM implementation | + +Build command: +```sh +g++ -O3 -march=native -std=c++17 -fopt-info-vec-missed -o lift_test lift_test.cpp +``` diff --git a/simd_test/lift_test.cpp b/simd_test/lift_test.cpp new file mode 100644 index 0000000000..a582f29d7d --- /dev/null +++ b/simd_test/lift_test.cpp @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Type transformation: replace each T in a tuple with std::array +// --------------------------------------------------------------------------- +template struct ToSimdTuple; +template +struct ToSimdTuple, W> { + using type = std::tuple...>; +}; + +// --------------------------------------------------------------------------- +// extractSlice: given a tuple of arrays, return a tuple of the i-th elements +// --------------------------------------------------------------------------- +template +auto extractSlice(const SimdTupleT& t, int i, std::index_sequence) { + return std::make_tuple(std::get(t)[i]...); +} + +// --------------------------------------------------------------------------- +// storeSlice: write a scalar tuple into the i-th slot of a SIMD tuple +// --------------------------------------------------------------------------- +template +void storeSlice(SimdTupleT& t, int i, const ScalarTupleT& s, std::index_sequence) { + ((std::get(t)[i] = std::get(s)), ...); +} + +// --------------------------------------------------------------------------- +// liftToSimd: lift a scalar tuple->tuple function to operate on W-wide arrays +// --------------------------------------------------------------------------- +template +auto liftToSimd(ScalarFn f) { + return [f](const auto& simdIn, auto& simdOut) { + constexpr auto inSize = std::tuple_size_v>; + constexpr auto outSize = std::tuple_size_v>; + for (int i = 0; i < W; i++) { + auto scalarIn = extractSlice(simdIn, i, 
std::make_index_sequence{}); + auto scalarOut = f(scalarIn); + storeSlice(simdOut, i, scalarOut, std::make_index_sequence{}); + } + }; +} + +// --------------------------------------------------------------------------- +// WENO5 upwind interpolation (from Stencils.h) +// --------------------------------------------------------------------------- +inline float weno5(float v1, float v2, float v3, float v4, float v5, float dx2 = 1.f) +{ + static constexpr float C = 13.f / 12.f; + const float eps = 1.0e-6f * dx2; + const float A1 = 0.1f / ((C*(v1-2*v2+v3)*(v1-2*v2+v3) + 0.25f*(v1-4*v2+3*v3)*(v1-4*v2+3*v3) + eps) * + (C*(v1-2*v2+v3)*(v1-2*v2+v3) + 0.25f*(v1-4*v2+3*v3)*(v1-4*v2+3*v3) + eps)); + const float A2 = 0.6f / ((C*(v2-2*v3+v4)*(v2-2*v3+v4) + 0.25f*(v2-v4)*(v2-v4) + eps) * + (C*(v2-2*v3+v4)*(v2-2*v3+v4) + 0.25f*(v2-v4)*(v2-v4) + eps)); + const float A3 = 0.3f / ((C*(v3-2*v4+v5)*(v3-2*v4+v5) + 0.25f*(3*v3-4*v4+v5)*(3*v3-4*v4+v5) + eps) * + (C*(v3-2*v4+v5)*(v3-2*v4+v5) + 0.25f*(3*v3-4*v4+v5)*(3*v3-4*v4+v5) + eps)); + return (A1*(2*v1 - 7*v2 + 11*v3) + A2*(5*v3 - v2 + 2*v4) + A3*(2*v3 + 5*v4 - v5)) / (6*(A1+A2+A3)); +} + +// --------------------------------------------------------------------------- +// GodunovsNormSqrd — blend formulation +// +// Computes both the outside and inside squared terms for each axis via +// ternary blend on isOutside. The intent is that each ternary compiles to +// vcmpps + vblendvps rather than a branch, but GCC's vectorizer currently +// still reports "control flow in loop" even when isOutside is a compile-time +// constant. See INVESTIGATION.md for the full vectorization story. +// --------------------------------------------------------------------------- +inline float godunovsNormSqrd(bool isOutside, + float dP_xm, float dP_xp, + float dP_ym, float dP_yp, + float dP_zm, float dP_zp) +{ + float xm = isOutside ? std::max( dP_xm, 0.f) * std::max( dP_xm, 0.f) + : std::max(-dP_xm, 0.f) * std::max(-dP_xm, 0.f); + float xp = isOutside ? std::max(-dP_xp, 0.f) * std::max(-dP_xp, 0.f) + : std::max( dP_xp, 0.f) * std::max( dP_xp, 0.f); + float ym = isOutside ? std::max( dP_ym, 0.f) * std::max( dP_ym, 0.f) + : std::max(-dP_ym, 0.f) * std::max(-dP_ym, 0.f); + float yp = isOutside ? std::max(-dP_yp, 0.f) * std::max(-dP_yp, 0.f) + : std::max( dP_yp, 0.f) * std::max( dP_yp, 0.f); + float zm = isOutside ? std::max( dP_zm, 0.f) * std::max( dP_zm, 0.f) + : std::max(-dP_zm, 0.f) * std::max(-dP_zm, 0.f); + float zp = isOutside ? 
std::max(-dP_zp, 0.f) * std::max(-dP_zp, 0.f) + : std::max( dP_zp, 0.f) * std::max( dP_zp, 0.f); + return std::max(xm, xp) + std::max(ym, yp) + std::max(zm, zp); +} + +// --------------------------------------------------------------------------- +// WenoNormSqGrad scalar lambda +// +// Input tuple indices follow WenoPt::idx: +// 0 = center (0, 0, 0) +// 1, 2, 3 = x-axis (-3,-2,-1) +// 4, 5, 6 = x-axis ( 1, 2, 3) +// 7, 8, 9 = y-axis (-3,-2,-1) +// 10,11,12 = y-axis ( 1, 2, 3) +// 13,14,15 = z-axis (-3,-2,-1) +// 16,17,18 = z-axis ( 1, 2, 3) +// --------------------------------------------------------------------------- +using WenoIn = std::tuple; +using WenoOut = std::tuple; + +// dx2 = dx^2 (scale for WENO eps), invDx2 = 1/dx^2, isoValue = level set iso +auto makeNormSqGrad(float dx2, float invDx2, float isoValue = 0.f) { + return [=](WenoIn in) -> WenoOut { + const float + v0 = std::get< 0>(in), + v1 = std::get< 1>(in), v2 = std::get< 2>(in), v3 = std::get< 3>(in), + v4 = std::get< 4>(in), v5 = std::get< 5>(in), v6 = std::get< 6>(in), + v7 = std::get< 7>(in), v8 = std::get< 8>(in), v9 = std::get< 9>(in), + v10 = std::get<10>(in), v11 = std::get<11>(in), v12 = std::get<12>(in), + v13 = std::get<13>(in), v14 = std::get<14>(in), v15 = std::get<15>(in), + v16 = std::get<16>(in), v17 = std::get<17>(in), v18 = std::get<18>(in); + + const float + dP_xm = weno5(v2-v1, v3-v2, v0-v3, v4-v0, v5-v4, dx2), + dP_xp = weno5(v6-v5, v5-v4, v4-v0, v0-v3, v3-v2, dx2), + dP_ym = weno5(v8-v7, v9-v8, v0-v9, v10-v0, v11-v10, dx2), + dP_yp = weno5(v12-v11, v11-v10, v10-v0, v0-v9, v9-v8, dx2), + dP_zm = weno5(v14-v13, v15-v14, v0-v15, v16-v0, v17-v16, dx2), + dP_zp = weno5(v18-v17, v17-v16, v16-v0, v0-v15, v15-v14, dx2); + + return { invDx2 * godunovsNormSqrd(v0 > isoValue, + dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp) }; + }; +} + +// --------------------------------------------------------------------------- +// SIMD wrapper +// --------------------------------------------------------------------------- +constexpr int W = 16; +using WenoSimdIn = typename ToSimdTuple::type; +using WenoSimdOut = typename ToSimdTuple::type; + +__attribute__((noinline)) +void runSimdNormSqGrad(const WenoSimdIn& simdIn, WenoSimdOut& simdOut, + float dx2, float invDx2, float isoValue) +{ + auto kernel = makeNormSqGrad(dx2, invDx2, isoValue); + auto simdKernel = liftToSimd(kernel); + simdKernel(simdIn, simdOut); +} + +// --------------------------------------------------------------------------- +// Reference: scalar normSqGrad directly on a float[19] array +// --------------------------------------------------------------------------- +float refNormSqGrad(const float* v, float dx2, float invDx2, float isoValue = 0.f) +{ + const float + dP_xm = weno5(v[2]-v[1], v[3]-v[2], v[0]-v[3], v[4]-v[0], v[5]-v[4], dx2), + dP_xp = weno5(v[6]-v[5], v[5]-v[4], v[4]-v[0], v[0]-v[3], v[3]-v[2], dx2), + dP_ym = weno5(v[8]-v[7], v[9]-v[8], v[0]-v[9], v[10]-v[0], v[11]-v[10], dx2), + dP_yp = weno5(v[12]-v[11], v[11]-v[10], v[10]-v[0], v[0]-v[9], v[9]-v[8], dx2), + dP_zm = weno5(v[14]-v[13], v[15]-v[14], v[0]-v[15], v[16]-v[0], v[17]-v[16], dx2), + dP_zp = weno5(v[18]-v[17], v[17]-v[16], v[16]-v[0], v[0]-v[15], v[15]-v[14], dx2); + return invDx2 * godunovsNormSqrd(v[0] > isoValue, + dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp); +} + +// --------------------------------------------------------------------------- +int main() +{ + const float dx = 0.1f, dx2 = dx*dx, invDx2 = 1.f/(dx*dx); + + // Fill 16 lanes with distinct synthetic level-set-like values + 
WenoSimdIn simdIn{}; + float refValues[W][19]; + + for (int i = 0; i < W; i++) { + // Smooth profile: v[n] ~ sin(n * 0.3 + i * 0.5) + for (int n = 0; n < 19; n++) { + float val = std::sin(n * 0.3f + i * 0.5f); + refValues[i][n] = val; + } + std::get< 0>(simdIn)[i] = refValues[i][ 0]; + std::get< 1>(simdIn)[i] = refValues[i][ 1]; + std::get< 2>(simdIn)[i] = refValues[i][ 2]; + std::get< 3>(simdIn)[i] = refValues[i][ 3]; + std::get< 4>(simdIn)[i] = refValues[i][ 4]; + std::get< 5>(simdIn)[i] = refValues[i][ 5]; + std::get< 6>(simdIn)[i] = refValues[i][ 6]; + std::get< 7>(simdIn)[i] = refValues[i][ 7]; + std::get< 8>(simdIn)[i] = refValues[i][ 8]; + std::get< 9>(simdIn)[i] = refValues[i][ 9]; + std::get<10>(simdIn)[i] = refValues[i][10]; + std::get<11>(simdIn)[i] = refValues[i][11]; + std::get<12>(simdIn)[i] = refValues[i][12]; + std::get<13>(simdIn)[i] = refValues[i][13]; + std::get<14>(simdIn)[i] = refValues[i][14]; + std::get<15>(simdIn)[i] = refValues[i][15]; + std::get<16>(simdIn)[i] = refValues[i][16]; + std::get<17>(simdIn)[i] = refValues[i][17]; + std::get<18>(simdIn)[i] = refValues[i][18]; + } + + WenoSimdOut simdOut{}; + runSimdNormSqGrad(simdIn, simdOut, dx2, invDx2, 0.f); + + printf("WenoNormSqGrad (W=%d, dx=%.2f):\n", W, dx); + bool allOk = true; + for (int i = 0; i < W; i++) { + float ref = refNormSqGrad(refValues[i], dx2, invDx2, 0.f); + float got = std::get<0>(simdOut)[i]; + bool ok = std::abs(got - ref) < 1e-5f * std::abs(ref) + 1e-10f; + printf(" lane %2d: %12.6f ref: %12.6f %s\n", i, got, ref, ok ? "OK" : "FAIL"); + allOk &= ok; + } + printf("\nOverall: %s\n", allOk ? "PASS" : "FAIL"); + return allOk ? 0 : 1; +} From fb395f14dc65c421bf1e2a78125f72a1cba89791 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Fri, 3 Apr 2026 14:34:42 -0500 Subject: [PATCH 08/60] simd_test: replace liftToSimd with generic-T Simd approach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce nanovdb::util::Simd (simd_test/Simd.h) — a minimal header-only SIMD abstraction backed by std::array with arithmetic operators, SimdMask, min/max, and where(). Mirrors the C++26 std::simd interface for forward compatibility. Rewrite the WENO5 normSqGrad kernel as a template on T: - T=float : scalar __hostdev__ path for GPU (one thread per voxel) - T=Simd : W-wide CPU path (one call per batch) A single templated godunovsNormSqrd + normSqGrad definition serves both execution contexts with no #ifdef, structurally matching Stencils.h. Clang 18 vectorizes the Simd instantiation (691 ymm instructions in the hot function, assembly-verified); GCC 13 does not. Update INVESTIGATION.md with the full scoreboard, both approaches, and next steps (GCC intrinsics path, benchmarking, nanovdb/util/ integration). 
Signed-off-by: Efstathios Sifakis Signed-off-by: Efty Sifakis --- simd_test/INVESTIGATION.md | 320 ++++++++++++++++++------------------- simd_test/Simd.h | 153 ++++++++++++++++++ simd_test/lift_test.cpp | 276 +++++++++++++------------------- 3 files changed, 421 insertions(+), 328 deletions(-) create mode 100644 simd_test/Simd.h diff --git a/simd_test/INVESTIGATION.md b/simd_test/INVESTIGATION.md index a59f3daa59..373180030f 100644 --- a/simd_test/INVESTIGATION.md +++ b/simd_test/INVESTIGATION.md @@ -1,9 +1,9 @@ -# liftToSimd Vectorization Investigation +# liftToSimd / Generic-T SIMD Vectorization Investigation -This document captures the design idea, experiments, findings, and open questions +This document captures the design ideas, experiments, findings, and open questions from an in-progress investigation into auto-vectorizing a scalar stencil kernel -through a generic SIMD lifting abstraction. It is written as a reference for -resuming the investigation in a future session. +for the VoxelBlockManager CPU port. Written as a reference for resuming the +investigation in a future session. --- @@ -12,221 +12,214 @@ resuming the investigation in a future session. The VoxelBlockManager CPU port (branch `vbm-cpu-port`) processes voxels in batches of `SIMDw = 16` (one AVX2 register width of uint16_t). For each batch the same stencil computation is applied to every lane. The goal is to write the stencil -physics **once** as a scalar, `__hostdev__`-compatible lambda (usable unmodified on +physics **once** as a scalar, `__hostdev__`-compatible function (usable unmodified on the GPU), and automatically derive an auto-vectorized CPU batch kernel from it. --- -## 2. The `liftToSimd` Pattern +## 2. Approach A: `liftToSimd` Pattern (superseded) -### 2a. Core Idea +### Core Idea -A scalar kernel has signature: +A scalar kernel with signature `ScalarTupleOut kernel(ScalarTupleIn)` is lifted to +W lanes by replacing every `T` in the tuple types with `std::array` (SoA +layout). A W-iteration loop extracts the i-th element from each input array, calls +the scalar kernel, and stores results back. This loop is the auto-vectorization +target. ```cpp -ScalarTupleOut kernel(ScalarTupleIn); -``` - -where `ScalarTupleIn = std::tuple` and `ScalarTupleOut = std::tuple`. - -The SIMD version replaces every `T` in the tuple with `std::array`, giving an -SoA (struct-of-arrays) layout: - -```cpp -SimdTupleIn = std::tuple, std::array, ..., std::array> -SimdTupleOut = std::tuple, ...> -``` - -`liftToSimd(kernel)` returns a lambda that loops over lanes 0..W-1, extracts the -i-th element from each input array (forming a `ScalarTupleIn`), calls `kernel`, and -writes the result back into the i-th slot of each output array. The loop is the -auto-vectorization target. - -### 2b. 
Infrastructure - -```cpp -// ToSimdTuple, W>::type = tuple...> -template struct ToSimdTuple; - -// extractSlice: return tuple of the i-th elements from a tuple-of-arrays -template -auto extractSlice(const SimdTupleT& t, int i, std::index_sequence); - -// storeSlice: write a scalar tuple into the i-th slot of a SIMD tuple -template -void storeSlice(SimdTupleT& t, int i, const ScalarTupleT& s, std::index_sequence); - template auto liftToSimd(ScalarFn f) { return [f](const auto& simdIn, auto& simdOut) { - constexpr auto inSize = std::tuple_size_v>; - constexpr auto outSize = std::tuple_size_v>; for (int i = 0; i < W; i++) { - auto scalarIn = extractSlice(simdIn, i, std::make_index_sequence{}); + auto scalarIn = extractSlice(simdIn, i, ...); auto scalarOut = f(scalarIn); - storeSlice(simdOut, i, scalarOut, std::make_index_sequence{}); + storeSlice(simdOut, i, scalarOut, ...); } }; } ``` -### 2c. Key Requirement: `__attribute__((noinline))` Wrapper +### Outcome -The vectorization loop must live inside a `__attribute__((noinline))` function. -Without this, GCC constant-folds the entire computation (because the test's input -data is compile-time computable) and emits no packed instructions at all, making it -appear that vectorization failed when it actually never ran. +Clang 18 vectorizes the unmodified kernel (with `std::max` and `bool isOutside`) +producing a full ymm path with a runtime alias check. GCC 13 does not vectorize in +any attempted form (see §4). ---- +### Why Superseded -## 3. Kernel Under Test: WENO5 `normSqGrad` +The input/output types are `std::tuple&, ...>` — reference +tuples pointing into existing SoA buffers. While correct, the design has two +limitations: -The scalar kernel computes the Godunov upwind norm-squared gradient using WENO5 -differences, matching `WenoStencil::normSqGrad` from `nanovdb/math/Stencils.h`. +1. The scalar kernel is a separate code path from the GPU kernel — it takes a + tuple, not individual arguments, and cannot be templated on `T` directly. +2. Vectorization relies entirely on the auto-vectorizer seeing through the tuple + extraction loop, which GCC cannot do. -**Inputs**: 19 floats `v0..v18` representing the center voxel and ±3 neighbors along -each axis (same layout as `WenoPt::idx`). +--- -**Computation**: -1. Six WENO5 calls → six upwind differences `dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp` -2. `godunovsNormSqrd(v0 > isoValue, dP_xm, ..., dP_zp)` → Godunov norm-squared -3. Scale by `invDx2` +## 3. Approach B: Generic-T Pattern (current) + +### Core Idea + +Instead of lifting a scalar kernel into SIMD, write the kernel **once** as a +template on its value type `T`: -**Type aliases**: ```cpp -using WenoIn = std::tuple; // 19 floats -using WenoOut = std::tuple; -constexpr int W = 16; -using WenoSimdIn = ToSimdTuple::type; -using WenoSimdOut = ToSimdTuple::type; +template +T normSqGrad(T v0, T v1, ..., T v18, float dx2, float invDx2, float isoValue); ``` ---- +- `T = float` → scalar path, `__hostdev__`-compatible, used on GPU per-thread +- `T = Simd` → W-wide SIMD path, used on CPU per-batch -## 4. Vectorization Experiments and Findings +All arithmetic operators, `min`, `max`, and `where` are overloaded for both `float` +and `Simd`, so the same source compiles correctly for both contexts with +zero `#ifdef`. -Compiled with: `g++ -O3 -march=native -std=c++17 -fopt-info-vec-missed` -Platform: x86-64, GCC 13, AVX2. +### `where()` — the key primitive -### Experiment 1 — Simple Laplacian (baseline) +The `bool isOutside ? 
a : b` ternary cannot be used with a SIMD mask. `where(mask, +a, b)` replaces it: -Kernel: `v0 - 6*v1 + v2 + ...` (7-point stencil, pure arithmetic). -**Result: VECTORIZES.** Emits `ymm`-width `vfmadd*ps`, `vaddps`. +```cpp +// Scalar overload (T=float): plain ternary +template T where(bool mask, T a, T b) { return mask ? a : b; } -### Experiment 2 — Six WENO5 calls, sum all six dP terms +// SIMD overload (T=Simd): lane-wise blend → VBLENDVPS, no branch +template +Simd where(SimdMask mask, Simd a, Simd b); +``` -Kernel computes `dP_xm + dP_xp + ... + dP_zp` (no `godunovsNormSqrd`). -Each `weno5` call is ~12 fmas and 2 divisions — complex but purely arithmetic. -**Result: VECTORIZES.** Confirms the WENO5 computation itself is not the blocker. +`v0 > T(isoValue)` deduces to `bool` when `T=float` and `SimdMask` when +`T=Simd`, so the call to `where()` resolves correctly in both cases. -### Experiment 3 — Full `normSqGrad` with `v0 > isoValue` +### `nanovdb::util::Simd` -Adds `godunovsNormSqrd(v0 > isoValue, ...)` to the pipeline. -GCC reports: `not vectorized: control flow in loop.` (loop line 41) -**Result: DOES NOT VECTORIZE.** +A minimal header-only library (`simd_test/Simd.h`, destined for `nanovdb/util/`) +providing: -### Experiment 4 — Constant `true` instead of `v0 > isoValue` +| Component | Purpose | +|-----------|---------| +| `Simd` | W-wide vector backed by `std::array`; broadcast constructor, `operator[]`, `store()`, arithmetic operators | +| `SimdMask` | Lane-wise boolean result of comparisons | +| `min`, `max` | Lane-wise min/max; scalar overloads for GPU path | +| `where` | Lane-wise blend; scalar overload for GPU path | +| Mixed `T op Simd` / `Simd op T` overloads | Enable `2.f * simd_val` etc. without requiring implicit conversions in template deduction | -Replace `v0 > isoValue` with compile-time `true` to rule out the comparison as the -blocker. -GCC still reports: `not vectorized: control flow in loop.` -**Result: DOES NOT VECTORIZE.** -Conclusion: the issue is inside `godunovsNormSqrd`, not at the call site. +~150 lines total. `__hostdev__`-annotated throughout (macro-guarded for non-CUDA +builds). Mirrors C++26 `std::simd` naming deliberately — migration is a typedef. -### Experiment 5 — Replace `bool isOutside` with `float sign`, use `fmaxf` +### Kernel structure -Reformulated `godunovsNormSqrd` to take `float sign` (+1.f/-1.f) and use `fmaxf` -instead of `std::max` and ternary operators: ```cpp -float xm = fmaxf( sign * dP_xm, 0.f); xm *= xm; -``` -GCC now reports a different error for the same loop: -``` -not vectorized: no vectype for stmt: - MEM [(const float &)simdIn_5(D)]._M_elems[_67] - scalar_type: const float -``` -**Result: DOES NOT VECTORIZE — but for a different reason.** -The `control flow` blocker is gone. A new blocker appears: GCC's vectorizer cannot -find a vector type for the struct member access through `std::tuple`'s -implementation-detail inheritance chain (`_Tuple_impl`, `_Head_base`). The -`(const float &)simdIn_5(D)` in the GIMPLE indicates the vectorizer is seeing the -parameter reference cast through the tuple internals and cannot determine the memory -access is stride-1. 
- ---- +template +T godunovsNormSqrd(MaskT isOutside, + T dP_xm, T dP_xp, T dP_ym, T dP_yp, T dP_zm, T dP_zp) +{ + const T zero(0.f); + T outside = max(max(dP_xm,zero)*max(dP_xm,zero), min(dP_xp,zero)*min(dP_xp,zero)) + + ...; // y, z + T inside = max(min(dP_xm,zero)*min(dP_xm,zero), max(dP_xp,zero)*max(dP_xp,zero)) + + ...; // y, z + return where(isOutside, outside, inside); +} -## 5. Current Blockers (in priority order) +template +T normSqGrad(T v0, T v1, ..., T v18, float dx2, float invDx2, float isoValue) +{ + const T dP_xm = weno5(...), dP_xp = weno5(...); + const T dP_ym = weno5(...), dP_yp = weno5(...); + const T dP_zm = weno5(...), dP_zp = weno5(...); + return invDx2 * godunovsNormSqrd(v0 > T(isoValue), + dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp); +} +``` -### Blocker A: `std::tuple` struct indirection +This is structurally identical to `WenoStencil::normSqGrad` in `Stencils.h`. -`std::get(simdIn)[i]` for fixed k, varying i, is a stride-1 access into one of -the 19 contiguous `std::array` members of the tuple. GCC's vectorizer -fails to prove this because `std::tuple` in libstdc++ uses recursive inheritance -(`_Tuple_impl : _Tuple_impl, _Head_base`), and the -GIMPLE representation of member access through that chain is too opaque for the -vectorizer's alias analysis. +### GPU / CPU call sites -**Hypothesis**: caching `.data()` pointers for each tuple element outside the hot -loop — so the loop only sees `inPtrs[k][i]` (simple indirect load) — may allow the -vectorizer to prove stride-1 access. This would require reworking `extractSlice` -and `storeSlice` to operate on pointer arrays rather than going through `std::get`. +```cpp +// GPU: one thread per voxel, scalar instantiation +float result = normSqGrad(v[0], v[1], ..., v[18], dx2, invDx2, iso); -### Blocker B: `std::max` / ternary in `godunovsNormSqrd` +// CPU: one call per batch of W voxels, SIMD instantiation +using FloatSimd = nanovdb::util::Simd; +FloatSimd result = normSqGrad(sv[0], sv[1], ..., sv[18], dx2, invDx2, iso); +``` -Even before reaching the struct-access issue, the ternary-based `std::max(a, b)` in -`godunovsNormSqrd` generates control flow IR that blocks vectorization. Using -`fmaxf` (which maps to a hardware `maxss`/`maxps` instruction) removes this blocker. -The current file keeps `std::max` / `bool isOutside` for readability and correctness; -any vectorization-capable reformulation will need `fmaxf` or equivalent. +NVCC's demand-driven template instantiation ensures `normSqGrad` is +never compiled for device — it is only instantiated in host code. --- -## 6. Proposed Next Steps +## 4. Vectorization Experiments and Findings (Approach A) -### Step 1 — Pointer-cache approach in `liftToSimd` +Platform: x86-64, AVX2, Ubuntu. GCC 13. Clang 18. +Base flags: `-O3 -march=native -std=c++17` -Before the hot loop, extract pointers to each tuple element's underlying array: +> **Warning — GCC false positive diagnostics**: `-fopt-info-vec-missed` / `-fopt-info-vec` +> can report `optimized: loop vectorized using 32 byte vectors` for code *outside* the +> hot loop. Assembly inspection is the only ground truth — always verify with +> `grep -c 'ymm'` and confirm the instructions fall inside the target function. 
-```cpp -// Before loop: -using ElemT = float; // known for homogeneous tuples -const ElemT* inPtrs[inSize]; -apply to index_sequence: inPtrs[Is] = std::get(simdIn).data(); +| Experiment | Kernel | GCC | Clang | +|---|---|---|---| +| 1 | Simple Laplacian (pure arithmetic) | Yes | Yes | +| 2 | WENO5 sum, no conditionals | Yes | Yes | +| 3 | Full `normSqGrad`, `bool isOutside` | **No** (control flow) | **Yes** | +| 4 | Same, `isOutside` = constant `true` | No (control flow in `std::max`) | Yes | +| 5 | `fmaxf` + `float sign` | No (struct-access blocker) | Yes | +| 6 | `fmaxf` + `-ffinite-math-only` | No (false positive diagnostic) | Yes | +| 7 | `__attribute__((optimize("finite-math-only")))` | No (doesn't propagate) | Yes | +| 8 | `__builtin_fmaxf` + `float sign` | No (struct-access blocker) | Yes | +| 9 | Pointer-cache + `__builtin_fmaxf` | No (call-clobbers-memory) | Yes | +| 10 | Flat `float[N][W]` arrays | No (gather stride) | n/a | -// In loop body: -// access: inPtrs[k][i] — provably stride-1 for fixed k, varying i -``` +**Conclusion for Approach A**: GCC 13 cannot auto-vectorize the `liftToSimd` pattern +in any attempted form. Clang 18 vectorizes the unmodified original. -Re-run `fopt-info-vec-missed` to see if the struct-access blocker disappears. - -### Step 2 — Combine with `fmaxf` in `godunovsNormSqrd` +--- -Once the struct-access blocker is cleared, reinstate the `fmaxf` / `float sign` -formulation and check whether the full `normSqGrad` kernel vectorizes. +## 5. Vectorization Results (Approach B, assembly-verified) -### Step 3 — Clang comparison +Compiled with: +```sh +clang++-18 -O3 -march=native -std=c++17 \ + -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \ + -o lift_test lift_test.cpp +``` -Compile the same test with `clang++ -O3 -march=native -Rpass=loop-vectorize` to -determine whether this is a GCC-specific limitation or a fundamental IR issue. -Clang's vectorizer handles struct member accesses differently and may succeed where -GCC fails. +**Clang**: 964 total ymm instructions; 691 inside `runSimdNormSqGrad` + +`normSqGrad>`. Key instructions: `vfmadd*ps`, `vsubps`, `vmulps`, +`vmaxps`, `vminps`, `vblendvps`, `vcmpnltps`. Two separate instantiations confirmed +in the symbol table: `normSqGrad>` and `normSqGrad`. -### Step 4 — Restore `v0 > isoValue` +**GCC**: Correct results but does not vectorize the per-operator loops inside +`Simd` ("more than one data ref in stmt", "return slot optimization" on +`weno5` calls). Same class of struct-access limitation as Approach A. -Once the constant-sign version vectorizes, replace `1.f` with -`(v0 > isoValue) ? 1.f : -1.f` at the call site. This introduces a VCMPPS+BLENDVPS -at the call site but no branching inside the arithmetic, which the vectorizer should -handle as a blend. +All 16 lanes produce correct results vs. the scalar (`T=float`) reference on both +compilers. -### Step 5 — Consider alternative abstraction: `const ValueType*` kernel +--- -`StencilGather.md §4a` already specifies the kernel lambda signature as -`std::array(const ValueType* u)` (raw pointer, not tuple). -If the tuple path proves too resistant to auto-vectorization, the SIMD lift can be -reformulated over flat `float[N][W]` SoA arrays instead. The `liftToSimd` idea -survives — the tuple input/output types would be replaced by flat arrays — but the -scalar lambda signature changes slightly. +## 6. 
Open Questions / Next Steps
+
+- **GCC support**: The per-operator loops in `Simd<T, W>` (e.g., `operator+`) are
+  simple W-iteration loops over `std::array<T, W>` members. GCC's "return slot
+  optimization" diagnostic on `weno5` calls suggests it cannot treat the `Simd<T, W>`
+  return values as local registers. Explicit intrinsics (AVX2 `__m256`) in
+  `Simd<T, W>` would guarantee GCC vectorization but require architecture-specific
+  specializations.
+- **Benchmarking**: Throughput of the vectorized Clang path vs. scalar not yet
+  measured on representative VBM data.
+- **Integration**: `Simd.h` to be moved to `nanovdb/util/Simd.h`; `weno5`,
+  `godunovsNormSqrd`, `normSqGrad` to be templated in `nanovdb/math/Stencils.h`.
+- **C++26 migration**: Once `std::simd` is available, `nanovdb::util::Simd<T, W>`
+  can be replaced with `std::fixed_size_simd<T, W>` — the kernel source is unchanged.
 
 ---
 
@@ -234,12 +227,19 @@ scalar lambda signature changes slightly.
 
 | File | Purpose |
 |------|---------|
-| `simd_test/lift_test.cpp` | Self-contained test: `liftToSimd` infrastructure + WENO5 normSqGrad kernel |
+| `simd_test/Simd.h` | Minimal `nanovdb::util::Simd<T, W>` library (prototype; destined for `nanovdb/util/`) |
+| `simd_test/lift_test.cpp` | Test: templated `weno5`, `godunovsNormSqrd`, `normSqGrad`; correctness check vs. scalar reference |
 | `nanovdb/nanovdb/math/Stencils.h` | Original `weno5`, `GodunovsNormSqrd`, `WenoStencil::normSqGrad` |
-| `nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md` | Per-block stencil gather design doc (kernel lambda spec, CPU batch strategy) |
+| `nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md` | Per-block stencil gather design doc |
 | `nanovdb/nanovdb/tools/VoxelBlockManager.h` | CPU VBM implementation |
 
-Build command:
+Build commands:
 ```sh
+# Clang (vectorizes):
+clang++-18 -O3 -march=native -std=c++17 \
+  -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \
+  -o lift_test lift_test.cpp
+
+# GCC (correct results, does not vectorize):
 g++ -O3 -march=native -std=c++17 -fopt-info-vec-missed -o lift_test lift_test.cpp
 ```
diff --git a/simd_test/Simd.h b/simd_test/Simd.h
new file mode 100644
index 0000000000..ee995c56f0
--- /dev/null
+++ b/simd_test/Simd.h
@@ -0,0 +1,153 @@
+#pragma once
+#include <array>
+
+// Minimal SIMD abstraction for NanoVDB stencil kernels.
+//
+// Designed to be __hostdev__-compatible: on CUDA device code, instantiate
+// kernels with T=float (scalar); on CPU, instantiate with T=Simd<float, W>.
+// All arithmetic operators and where()/max() are overloaded for both cases,
+// so a single templated kernel compiles correctly for both execution contexts.
+//
+// Mirrors the C++26 std::simd interface deliberately — migration is a typedef.
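+//
+// Minimal usage sketch (illustrative only; names below are local to this comment):
+//   using FloatSimd = nanovdb::util::Simd<float, 16>;
+//   FloatSimd a(srcA), b(srcB);          // explicit lane loads from const float*
+//   FloatSimd hi = where(a > b, a, b);   // lane-wise max via blend
+//   hi.store(dst);                       // write all 16 lanes back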
+
+// ---------------------------------------------------------------------------
+// Portability: __hostdev__ is a no-op outside CUDA
+// ---------------------------------------------------------------------------
+#ifndef __CUDACC__
+#  define NANOVDB_SIMD_HOSTDEV
+#else
+#  define NANOVDB_SIMD_HOSTDEV __host__ __device__
+#endif
+
+namespace nanovdb {
+namespace util {
+
+template<typename T, int W> struct Simd;
+template<typename T, int W> struct SimdMask;
+
+// ---------------------------------------------------------------------------
+// SimdMask: result of a lane-wise comparison
+// ---------------------------------------------------------------------------
+template<typename T, int W>
+struct SimdMask {
+    std::array<bool, W> data{};
+
+    NANOVDB_SIMD_HOSTDEV bool  operator[](int i) const { return data[i]; }
+    NANOVDB_SIMD_HOSTDEV bool& operator[](int i)       { return data[i]; }
+};
+
+// ---------------------------------------------------------------------------
+// Simd: W-wide vector of T, backed by std::array
+// ---------------------------------------------------------------------------
+template<typename T, int W>
+struct Simd {
+    std::array<T, W> data{};
+
+    Simd() = default;
+    NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); }   // broadcast
+    NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p) {
+        for (int i = 0; i < W; i++) data[i] = p[i];
+    }
+
+    NANOVDB_SIMD_HOSTDEV T  operator[](int i) const { return data[i]; }
+    NANOVDB_SIMD_HOSTDEV T& operator[](int i)       { return data[i]; }
+
+    NANOVDB_SIMD_HOSTDEV void store(T* p) const {
+        for (int i = 0; i < W; i++) p[i] = data[i];
+    }
+
+    // Unary minus
+    NANOVDB_SIMD_HOSTDEV Simd operator-() const {
+        Simd r;
+        for (int i = 0; i < W; i++) r.data[i] = -data[i];
+        return r;
+    }
+
+    // Lane-wise arithmetic (Simd op Simd)
+    NANOVDB_SIMD_HOSTDEV Simd operator+(Simd o) const {
+        Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] + o.data[i]; return r;
+    }
+    NANOVDB_SIMD_HOSTDEV Simd operator-(Simd o) const {
+        Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] - o.data[i]; return r;
+    }
+    NANOVDB_SIMD_HOSTDEV Simd operator*(Simd o) const {
+        Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] * o.data[i]; return r;
+    }
+    NANOVDB_SIMD_HOSTDEV Simd operator/(Simd o) const {
+        Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] / o.data[i]; return r;
+    }
+
+    // Lane-wise comparison → SimdMask
+    NANOVDB_SIMD_HOSTDEV SimdMask<T, W> operator>(Simd o) const {
+        SimdMask<T, W> m;
+        for (int i = 0; i < W; i++) m.data[i] = data[i] > o.data[i];
+        return m;
+    }
+};
+
+// ---------------------------------------------------------------------------
+// Mixed scalar/Simd arithmetic (enables e.g. 2.f * simd_val)
+// Template argument deduction does not use implicit conversions, so these
+// explicit overloads are required for scalar op Simd and Simd op scalar. 
+// ---------------------------------------------------------------------------
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator+(Simd<T, W> a, T b) { return a + Simd<T, W>(b); }
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator+(T a, Simd<T, W> b) { return Simd<T, W>(a) + b; }
+
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator-(Simd<T, W> a, T b) { return a - Simd<T, W>(b); }
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator-(T a, Simd<T, W> b) { return Simd<T, W>(a) - b; }
+
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator*(Simd<T, W> a, T b) { return a * Simd<T, W>(b); }
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator*(T a, Simd<T, W> b) { return Simd<T, W>(a) * b; }
+
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator/(Simd<T, W> a, T b) { return a / Simd<T, W>(b); }
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator/(T a, Simd<T, W> b) { return Simd<T, W>(a) / b; }
+
+// ---------------------------------------------------------------------------
+// min/max: lane-wise minimum and maximum
+// Scalar overloads ensure templated kernels compile for T=float (GPU path)
+// ---------------------------------------------------------------------------
+template<typename T>
+NANOVDB_SIMD_HOSTDEV T min(T a, T b) { return a < b ? a : b; }
+
+template<typename T>
+NANOVDB_SIMD_HOSTDEV T max(T a, T b) { return a > b ? a : b; }
+
+template<typename T, int W>
+NANOVDB_SIMD_HOSTDEV Simd<T, W> min(Simd<T, W> a, Simd<T, W> b) {
+    Simd<T, W> r;
+    for (int i = 0; i < W; i++) r[i] = a[i] < b[i] ? a[i] : b[i];
+    return r;
+}
+
+template<typename T, int W>
+NANOVDB_SIMD_HOSTDEV Simd<T, W> max(Simd<T, W> a, Simd<T, W> b) {
+    Simd<T, W> r;
+    for (int i = 0; i < W; i++) r[i] = a[i] > b[i] ? a[i] : b[i];
+    return r;
+}
+
+// ---------------------------------------------------------------------------
+// where: lane-wise select — returns a where mask is true, b otherwise
+// Maps to VBLENDVPS on AVX2; no branching in vectorized code.
+// Scalar overload: plain ternary, for T=float (GPU path)
+// ---------------------------------------------------------------------------
+template<typename T>
+NANOVDB_SIMD_HOSTDEV T where(bool mask, T a, T b) { return mask ? a : b; }
+
+template<typename T, int W>
+NANOVDB_SIMD_HOSTDEV Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
+    Simd<T, W> r;
+    for (int i = 0; i < W; i++) r[i] = mask[i] ? 
a[i] : b[i];
+    return r;
+}
+
+} // namespace util
+} // namespace nanovdb
diff --git a/simd_test/lift_test.cpp b/simd_test/lift_test.cpp
index a582f29d7d..a697cd3d2e 100644
--- a/simd_test/lift_test.cpp
+++ b/simd_test/lift_test.cpp
@@ -1,168 +1,125 @@
-#include <tuple>
-#include <array>
-#include <utility>
+#include "Simd.h"
 #include <cstdio>
 #include <cmath>
-#include <algorithm>

-// ---------------------------------------------------------------------------
-// Type transformation: replace each T in a tuple with std::array<T, W>
-// ---------------------------------------------------------------------------
-template<typename TupleT, int W> struct ToSimdTuple;
-template<typename... Ts, int W>
-struct ToSimdTuple<std::tuple<Ts...>, W> {
-    using type = std::tuple<std::array<Ts, W>...>;
-};
-
-// ---------------------------------------------------------------------------
-// extractSlice: given a tuple of arrays, return a tuple of the i-th elements
-// ---------------------------------------------------------------------------
-template<typename SimdTupleT, size_t... Is>
-auto extractSlice(const SimdTupleT& t, int i, std::index_sequence<Is...>) {
-    return std::make_tuple(std::get<Is>(t)[i]...);
-}
-
-// ---------------------------------------------------------------------------
-// storeSlice: write a scalar tuple into the i-th slot of a SIMD tuple
-// ---------------------------------------------------------------------------
-template<typename SimdTupleT, typename ScalarTupleT, size_t... Is>
-void storeSlice(SimdTupleT& t, int i, const ScalarTupleT& s, std::index_sequence<Is...>) {
-    ((std::get<Is>(t)[i] = std::get<Is>(s)), ...);
-}
+using namespace nanovdb::util;

 // ---------------------------------------------------------------------------
-// liftToSimd: lift a scalar tuple->tuple function to operate on W-wide arrays
+// WENO5 upwind interpolation — templated on T
+//   T = float          : scalar, usable as __hostdev__ on GPU
+//   T = Simd<float, W> : W-wide vectorized version on CPU
 // ---------------------------------------------------------------------------
-template<int W, typename ScalarFn>
-auto liftToSimd(ScalarFn f) {
-    return [f](const auto& simdIn, auto& simdOut) {
-        constexpr auto inSize  = std::tuple_size_v<std::decay_t<decltype(simdIn)>>;
-        constexpr auto outSize = std::tuple_size_v<std::decay_t<decltype(simdOut)>>;
-        for (int i = 0; i < W; i++) {
-            auto scalarIn  = extractSlice(simdIn, i, std::make_index_sequence<inSize>{});
-            auto scalarOut = f(scalarIn);
-            storeSlice(simdOut, i, scalarOut, std::make_index_sequence<outSize>{});
-        }
-    };
-}
-
-// ---------------------------------------------------------------------------
-// WENO5 upwind interpolation (from Stencils.h)
-// ---------------------------------------------------------------------------
-inline float weno5(float v1, float v2, float v3, float v4, float v5, float dx2 = 1.f)
+template<typename T>
+T weno5(T v1, T v2, T v3, T v4, T v5, float dx2 = 1.f)
 {
-    static constexpr float C = 13.f / 12.f;
-    const float eps = 1.0e-6f * dx2;
-    const float A1 = 0.1f / ((C*(v1-2*v2+v3)*(v1-2*v2+v3) + 0.25f*(v1-4*v2+3*v3)*(v1-4*v2+3*v3) + eps) *
-                             (C*(v1-2*v2+v3)*(v1-2*v2+v3) + 0.25f*(v1-4*v2+3*v3)*(v1-4*v2+3*v3) + eps));
-    const float A2 = 0.6f / ((C*(v2-2*v3+v4)*(v2-2*v3+v4) + 0.25f*(v2-v4)*(v2-v4) + eps) *
-                             (C*(v2-2*v3+v4)*(v2-2*v3+v4) + 0.25f*(v2-v4)*(v2-v4) + eps));
-    const float A3 = 0.3f / ((C*(v3-2*v4+v5)*(v3-2*v4+v5) + 0.25f*(3*v3-4*v4+v5)*(3*v3-4*v4+v5) + eps) *
-                             (C*(v3-2*v4+v5)*(v3-2*v4+v5) + 0.25f*(3*v3-4*v4+v5)*(3*v3-4*v4+v5) + eps));
-    return (A1*(2*v1 - 7*v2 + 11*v3) + A2*(5*v3 - v2 + 2*v4) + A3*(2*v3 + 5*v4 - v5)) / (6*(A1+A2+A3));
+    const float C = 13.f / 12.f;
+    const T eps = T(1.0e-6f * dx2);
+
+    const T d12 = v1 - 2.f*v2 + v3;
+    const T d13 = v1 - 4.f*v2 + 3.f*v3;
+    const T d23 = v2 - 2.f*v3 + v4;
+    const T d24 = v2 - v4;
+    const T d34 = v3 - 2.f*v4 + v5;
+    const T d35 = 3.f*v3 - 4.f*v4 + v5;
+
+    
const T w1 = C*d12*d12 + 0.25f*d13*d13 + eps;
+    const T w2 = C*d23*d23 + 0.25f*d24*d24 + eps;
+    const T w3 = C*d34*d34 + 0.25f*d35*d35 + eps;
+
+    const T A1 = 0.1f / (w1*w1);
+    const T A2 = 0.6f / (w2*w2);
+    const T A3 = 0.3f / (w3*w3);
+
+    return (A1*(2.f*v1 - 7.f*v2 + 11.f*v3) +
+            A2*(5.f*v3 -     v2 +  2.f*v4) +
+            A3*(2.f*v3 + 5.f*v4 -     v5)) / (6.f*(A1+A2+A3));
 }

 // ---------------------------------------------------------------------------
-// GodunovsNormSqrd — blend formulation
-//
-// Computes both the outside and inside squared terms for each axis via
-// ternary blend on isOutside. The intent is that each ternary compiles to
-// vcmpps + vblendvps rather than a branch, but GCC's vectorizer currently
-// still reports "control flow in loop" even when isOutside is a compile-time
-// constant. See INVESTIGATION.md for the full vectorization story.
+// godunovsNormSqrd — templated on T (value type) and MaskT (comparison result)
+//   MaskT = bool               when T = float          (GPU / scalar path)
+//   MaskT = SimdMask<float, W> when T = Simd<float, W> (CPU SIMD path)
+// Mirrors GodunovsNormSqrd in nanovdb/math/Stencils.h
 // ---------------------------------------------------------------------------
-inline float godunovsNormSqrd(bool isOutside,
-                              float dP_xm, float dP_xp,
-                              float dP_ym, float dP_yp,
-                              float dP_zm, float dP_zp)
+template<typename T, typename MaskT>
+T godunovsNormSqrd(MaskT isOutside,
+                   T dP_xm, T dP_xp,
+                   T dP_ym, T dP_yp,
+                   T dP_zm, T dP_zp)
 {
-    float xm = isOutside ? std::max( dP_xm, 0.f) * std::max( dP_xm, 0.f)
-                         : std::max(-dP_xm, 0.f) * std::max(-dP_xm, 0.f);
-    float xp = isOutside ? std::max(-dP_xp, 0.f) * std::max(-dP_xp, 0.f)
-                         : std::max( dP_xp, 0.f) * std::max( dP_xp, 0.f);
-    float ym = isOutside ? std::max( dP_ym, 0.f) * std::max( dP_ym, 0.f)
-                         : std::max(-dP_ym, 0.f) * std::max(-dP_ym, 0.f);
-    float yp = isOutside ? std::max(-dP_yp, 0.f) * std::max(-dP_yp, 0.f)
-                         : std::max( dP_yp, 0.f) * std::max( dP_yp, 0.f);
-    float zm = isOutside ? std::max( dP_zm, 0.f) * std::max( dP_zm, 0.f)
-                         : std::max(-dP_zm, 0.f) * std::max(-dP_zm, 0.f);
-    float zp = isOutside ? 
std::max(-dP_zp, 0.f) * std::max(-dP_zp, 0.f)
-                         : std::max( dP_zp, 0.f) * std::max( dP_zp, 0.f);
-    return std::max(xm, xp) + std::max(ym, yp) + std::max(zm, zp);
+    const T zero(0.f);
+    T outside = max(max(dP_xm, zero) * max(dP_xm, zero),
+                    min(dP_xp, zero) * min(dP_xp, zero))   // (dP/dx)^2
+              + max(max(dP_ym, zero) * max(dP_ym, zero),
+                    min(dP_yp, zero) * min(dP_yp, zero))   // (dP/dy)^2
+              + max(max(dP_zm, zero) * max(dP_zm, zero),
+                    min(dP_zp, zero) * min(dP_zp, zero));  // (dP/dz)^2
+
+    T inside  = max(min(dP_xm, zero) * min(dP_xm, zero),
+                    max(dP_xp, zero) * max(dP_xp, zero))   // (dP/dx)^2
+              + max(min(dP_ym, zero) * min(dP_ym, zero),
+                    max(dP_yp, zero) * max(dP_yp, zero))   // (dP/dy)^2
+              + max(min(dP_zm, zero) * min(dP_zm, zero),
+                    max(dP_zp, zero) * max(dP_zp, zero));  // (dP/dz)^2
+
+    return where(isOutside, outside, inside);
 }

 // ---------------------------------------------------------------------------
-// WenoNormSqGrad scalar lambda
-//
-// Input tuple indices follow WenoPt::idx:
-//   0        = center  (0, 0, 0)
-//   1, 2, 3  = x-axis  (-3,-2,-1)
-//   4, 5, 6  = x-axis  ( 1, 2, 3)
-//   7, 8, 9  = y-axis  (-3,-2,-1)
-//   10,11,12 = y-axis  ( 1, 2, 3)
-//   13,14,15 = z-axis  (-3,-2,-1)
-//   16,17,18 = z-axis  ( 1, 2, 3)
+// normSqGrad — templated on T
+// Input layout matches WenoStencil / WenoPt::idx (19 values):
+//   v0       = center  ( 0, 0, 0)
+//   v1..v6   = x-axis  (-3,-2,-1, +1,+2,+3)
+//   v7..v12  = y-axis  (-3,-2,-1, +1,+2,+3)
+//   v13..v18 = z-axis  (-3,-2,-1, +1,+2,+3)
 // ---------------------------------------------------------------------------
-using WenoIn  = std::tuple<float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float>;
-using WenoOut = std::tuple<float>;
-
-// dx2 = dx^2 (scale for WENO eps), invDx2 = 1/dx^2, isoValue = level set iso
-auto makeNormSqGrad(float dx2, float invDx2, float isoValue = 0.f) {
-    return [=](WenoIn in) -> WenoOut {
-        const float
-            v0  = std::get< 0>(in),
-            v1  = std::get< 1>(in), v2  = std::get< 2>(in), v3  = std::get< 3>(in),
-            v4  = std::get< 4>(in), v5  = std::get< 5>(in), v6  = std::get< 6>(in),
-            v7  = std::get< 7>(in), v8  = std::get< 8>(in), v9  = std::get< 9>(in),
-            v10 = std::get<10>(in), v11 = std::get<11>(in), v12 = std::get<12>(in),
-            v13 = std::get<13>(in), v14 = std::get<14>(in), v15 = std::get<15>(in),
-            v16 = std::get<16>(in), v17 = std::get<17>(in), v18 = std::get<18>(in);
-
-        const float
-            dP_xm = weno5(v2-v1,   v3-v2,   v0-v3,   v4-v0,   v5-v4,   dx2),
-            dP_xp = weno5(v6-v5,   v5-v4,   v4-v0,   v0-v3,   v3-v2,   dx2),
-            dP_ym = weno5(v8-v7,   v9-v8,   v0-v9,   v10-v0,  v11-v10, dx2),
-            dP_yp = weno5(v12-v11, v11-v10, v10-v0,  v0-v9,   v9-v8,   dx2),
-            dP_zm = weno5(v14-v13, v15-v14, v0-v15,  v16-v0,  v17-v16, dx2),
-            dP_zp = weno5(v18-v17, v17-v16, v16-v0,  v0-v15,  v15-v14, dx2);
-
-        return { invDx2 * godunovsNormSqrd(v0 > isoValue,
-                                           dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp) };
-    };
+template<typename T>
+T normSqGrad(T v0,
+             T v1,  T v2,  T v3,  T v4,  T v5,  T v6,
+             T v7,  T v8,  T v9,  T v10, T v11, T v12,
+             T v13, T v14, T v15, T v16, T v17, T v18,
+             float dx2, float invDx2, float isoValue = 0.f)
+{
+    const T dP_xm = weno5(v2 -v1,  v3 -v2,  v0 -v3,  v4 -v0,  v5 -v4,  dx2);
+    const T dP_xp = weno5(v6 -v5,  v5 -v4,  v4 -v0,  v0 -v3,  v3 -v2,  dx2);
+    const T dP_ym = weno5(v8 -v7,  v9 -v8,  v0 -v9,  v10-v0,  v11-v10, dx2);
+    const T dP_yp = weno5(v12-v11, v11-v10, v10-v0,  v0 -v9,  v9 -v8,  dx2);
+    const T dP_zm = weno5(v14-v13, v15-v14, v0 -v15, v16-v0,  v17-v16, dx2);
+    const T dP_zp = weno5(v18-v17, v17-v16, v16-v0,  v0 -v15, v15-v14, dx2);
+
+    return invDx2 * godunovsNormSqrd(v0 > T(isoValue),
+                                     dP_xm, dP_xp,
+                                     dP_ym, dP_yp,
+                                     dP_zm, dP_zp);
 }

 // 
---------------------------------------------------------------------------
-// SIMD wrapper
+// SIMD wrapper — noinline to prevent constant-folding in the test
 // ---------------------------------------------------------------------------
 constexpr int W = 16;
-using WenoSimdIn  = typename ToSimdTuple<WenoIn,  W>::type;
-using WenoSimdOut = typename ToSimdTuple<WenoOut, W>::type;
+using FloatSimd = Simd<float, W>;

 __attribute__((noinline))
-void runSimdNormSqGrad(const WenoSimdIn& simdIn, WenoSimdOut& simdOut,
-                       float dx2, float invDx2, float isoValue)
+FloatSimd runSimdNormSqGrad(const FloatSimd sv[19],
+                            float dx2, float invDx2, float isoValue)
 {
-    auto kernel     = makeNormSqGrad(dx2, invDx2, isoValue);
-    auto simdKernel = liftToSimd<W>(kernel);
-    simdKernel(simdIn, simdOut);
+    return normSqGrad(
+        sv[0],  sv[1],  sv[2],  sv[3],  sv[4],  sv[5],  sv[6],
+        sv[7],  sv[8],  sv[9],  sv[10], sv[11], sv[12],
+        sv[13], sv[14], sv[15], sv[16], sv[17], sv[18],
+        dx2, invDx2, isoValue);
 }

 // ---------------------------------------------------------------------------
-// Reference: scalar normSqGrad directly on a float[19] array
+// Reference: scalar path — same kernel instantiated with T=float
 // ---------------------------------------------------------------------------
-float refNormSqGrad(const float* v, float dx2, float invDx2, float isoValue = 0.f)
+float refNormSqGrad(const float v[19], float dx2, float invDx2, float isoValue = 0.f)
 {
-    const float
-        dP_xm = weno5(v[2]-v[1],   v[3]-v[2],   v[0]-v[3],   v[4]-v[0],   v[5]-v[4],   dx2),
-        dP_xp = weno5(v[6]-v[5],   v[5]-v[4],   v[4]-v[0],   v[0]-v[3],   v[3]-v[2],   dx2),
-        dP_ym = weno5(v[8]-v[7],   v[9]-v[8],   v[0]-v[9],   v[10]-v[0],  v[11]-v[10], dx2),
-        dP_yp = weno5(v[12]-v[11], v[11]-v[10], v[10]-v[0],  v[0]-v[9],   v[9]-v[8],   dx2),
-        dP_zm = weno5(v[14]-v[13], v[15]-v[14], v[0]-v[15],  v[16]-v[0],  v[17]-v[16], dx2),
-        dP_zp = weno5(v[18]-v[17], v[17]-v[16], v[16]-v[0],  v[0]-v[15],  v[15]-v[14], dx2);
-    return invDx2 * godunovsNormSqrd(v[0] > isoValue,
-                                     dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp);
+    return normSqGrad(
+        v[0],  v[1],  v[2],  v[3],  v[4],  v[5],  v[6],
+        v[7],  v[8],  v[9],  v[10], v[11], v[12],
+        v[13], v[14], v[15], v[16], v[17], v[18],
+        dx2, invDx2, isoValue);
 }

 // ---------------------------------------------------------------------------
@@ -170,46 +127,29 @@
 int main()
 {
     const float dx = 0.1f, dx2 = dx*dx, invDx2 = 1.f/(dx*dx);

-    // Fill 16 lanes with distinct synthetic level-set-like values
-    WenoSimdIn simdIn{};
+    // Storage: SoA layout — inData[n] holds W lane values for stencil position n
+    float inData[19][W]{};
     float refValues[W][19];
-    for (int i = 0; i < W; i++) {
-        // Smooth profile: v[n] ~ sin(n * 0.3 + i * 0.5)
+    for (int i = 0; i < W; i++)
         for (int n = 0; n < 19; n++) {
-            float val = std::sin(n * 0.3f + i * 0.5f);
-            refValues[i][n] = val;
+            refValues[i][n] = std::sin(n * 0.3f + i * 0.5f);
+            inData[n][i]    = refValues[i][n];
         }
-        std::get< 0>(simdIn)[i] = refValues[i][ 0];
-        std::get< 1>(simdIn)[i] = refValues[i][ 1];
-        std::get< 2>(simdIn)[i] = refValues[i][ 2];
-        std::get< 3>(simdIn)[i] = refValues[i][ 3];
-        std::get< 4>(simdIn)[i] = refValues[i][ 4];
-        std::get< 5>(simdIn)[i] = refValues[i][ 5];
-        std::get< 6>(simdIn)[i] = refValues[i][ 6];
-        std::get< 7>(simdIn)[i] = refValues[i][ 7];
-        std::get< 8>(simdIn)[i] = refValues[i][ 8];
-        std::get< 9>(simdIn)[i] = refValues[i][ 9];
-        std::get<10>(simdIn)[i] = refValues[i][10];
-        std::get<11>(simdIn)[i] = refValues[i][11];
-        std::get<12>(simdIn)[i] = refValues[i][12];
-        std::get<13>(simdIn)[i] = refValues[i][13];
-        std::get<14>(simdIn)[i] = 
refValues[i][14];
-        std::get<15>(simdIn)[i] = refValues[i][15];
-        std::get<16>(simdIn)[i] = refValues[i][16];
-        std::get<17>(simdIn)[i] = refValues[i][17];
-        std::get<18>(simdIn)[i] = refValues[i][18];
-    }

-    WenoSimdOut simdOut{};
-    runSimdNormSqGrad(simdIn, simdOut, dx2, invDx2, 0.f);
+    // Load into Simd — each FloatSimd holds one stencil position across all W lanes
+    FloatSimd sv[19];
+    for (int n = 0; n < 19; n++)
+        sv[n] = FloatSimd(inData[n]);
+
+    FloatSimd result = runSimdNormSqGrad(sv, dx2, invDx2, 0.f);

-    printf("WenoNormSqGrad (W=%d, dx=%.2f):\n", W, dx);
+    printf("WenoNormSqGrad full 3-axis (W=%d, dx=%.2f):\n", W, dx);
     bool allOk = true;
     for (int i = 0; i < W; i++) {
-        float ref = refNormSqGrad(refValues[i], dx2, invDx2, 0.f);
-        float got = std::get<0>(simdOut)[i];
-        bool ok  = std::abs(got - ref) < 1e-5f * std::abs(ref) + 1e-10f;
+        float ref = refNormSqGrad(refValues[i], dx2, invDx2, 0.f);
+        float got = result[i];
+        bool ok   = std::abs(got - ref) < 1e-5f * std::abs(ref) + 1e-10f;
         printf("  lane %2d: %12.6f   ref: %12.6f   %s\n", i, got, ref, ok ? "OK" : "FAIL");
         allOk &= ok;
     }

From b45ebad9431e0183e238969662ec888a0321b7df Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Fri, 3 Apr 2026 14:40:46 -0500
Subject: [PATCH 09/60] simd_test/Simd.h: add std::experimental::simd backend
 for C++26

Auto-detect <experimental/simd> (Parallelism TS v2) via __has_include and
__cpp_lib_experimental_parallel_simd. When available, Simd and SimdMask
become thin wrappers around fixed_size_simd / fixed_size_simd_mask,
delegating all arithmetic to the standard type.

The TS v2 where(mask, v) is a 2-arg masked-assignment proxy; wrap it into
the 3-arg select(mask, a, b) form expected by the kernels.

Verified with clang++-18 -std=c++26: both paths produce identical assembly
(1275 ymm instructions, PASS on all 16 lanes), confirming Clang optimizes
through the wrapper completely.

Signed-off-by: Efstathios Sifakis
Signed-off-by: Efty Sifakis
---
 simd_test/Simd.h | 181 +++++++++++++++++++++++++++++------------------
 1 file changed, 112 insertions(+), 69 deletions(-)

diff --git a/simd_test/Simd.h b/simd_test/Simd.h
index ee995c56f0..8c56770fd6 100644
--- a/simd_test/Simd.h
+++ b/simd_test/Simd.h
@@ -3,12 +3,22 @@

 // Minimal SIMD abstraction for NanoVDB stencil kernels.
 //
-// Designed to be __hostdev__-compatible: on CUDA device code, instantiate
-// kernels with T=float (scalar); on CPU, instantiate with T=Simd<float, W>.
-// All arithmetic operators and where()/max() are overloaded for both cases,
-// so a single templated kernel compiles correctly for both execution contexts.
+// Two implementations, selected automatically at compile time:
 //
-// Mirrors the C++26 std::simd interface deliberately — migration is a typedef.
+//   NANOVDB_USE_STD_SIMD (set when <experimental/simd> is available):
+//     Simd<T, W> and SimdMask<T, W> are thin wrappers around
+//     std::experimental::fixed_size_simd / fixed_size_simd_mask.
+//     All arithmetic delegates to the standard type; the compiler emits
+//     native vector instructions without relying on the auto-vectorizer.
+//
+//   Default (std::array backend):
+//     Simd<T, W> wraps std::array<T, W> with element-wise operator loops.
+//     Clang auto-vectorizes these loops; GCC does not.
+//
+// In both cases the interface is identical, so templated kernels (T=float
+// for GPU, T=Simd<float, W> for CPU) compile unmodified.
+//
+// Mirrors the C++26 std::simd naming — migration will be a typedef swap. 
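+//
+// Backend-independence sketch (illustrative only; the kernel name and values
+// are hypothetical, and a using-declaration for nanovdb::util is assumed at
+// the call site):
+//
+//     template<typename T>
+//     T clampedSquare(T v) { return max(v, T(0.f)) * max(v, T(0.f)); }
+//
+//     float          s = clampedSquare(-1.5f);                  // scalar / GPU path
+//     Simd<float, 8> r = clampedSquare(Simd<float, 8>(-1.5f));  // SIMD path, same source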
// ---------------------------------------------------------------------------
 // Portability: __hostdev__ is a no-op outside CUDA
 // ---------------------------------------------------------------------------
@@ -19,51 +29,111 @@
 #  define NANOVDB_SIMD_HOSTDEV __host__ __device__
 #endif

+// ---------------------------------------------------------------------------
+// Auto-detect std::experimental::simd (Parallelism TS v2)
+// ---------------------------------------------------------------------------
+#if defined(__has_include) && __has_include(<experimental/simd>)
+#  include <experimental/simd>
+#  ifdef __cpp_lib_experimental_parallel_simd
+#    define NANOVDB_USE_STD_SIMD 1
+#  endif
+#endif
+
 namespace nanovdb {
 namespace util {

-template<typename T, int W> struct Simd;
-template<typename T, int W> struct SimdMask;
+// ===========================================================================
+// Implementation A: std::experimental::simd wrapper
+// ===========================================================================
+#ifdef NANOVDB_USE_STD_SIMD
+
+namespace stdx = std::experimental;

-// ---------------------------------------------------------------------------
-// SimdMask: result of a lane-wise comparison
-// ---------------------------------------------------------------------------
 template<typename T, int W>
 struct SimdMask {
-    std::array<bool, W> data{};
+    stdx::fixed_size_simd_mask<T, W> inner{};
+
+    SimdMask() = default;
+    SimdMask(stdx::fixed_size_simd_mask<T, W> m) : inner(m) {}
+    bool operator[](int i) const { return inner[i]; }
+};
+
+template<typename T, int W>
+struct Simd {
+    using StdxT = stdx::fixed_size_simd<T, W>;
+    StdxT inner{};
+
+    Simd() = default;
+    Simd(T scalar) : inner(scalar) {}        // broadcast
+    explicit Simd(const T* p)                // load
+        : inner(p, stdx::element_aligned) {}
+    Simd(StdxT v) : inner(v) {}              // from stdx ops
+
+    T operator[](int i) const { return inner[i]; }
+    void store(T* p) const { inner.copy_to(p, stdx::element_aligned); }
+
+    Simd operator-() const { return Simd(-inner); }
+    Simd operator+(Simd o) const { return Simd(inner + o.inner); }
+    Simd operator-(Simd o) const { return Simd(inner - o.inner); }
+    Simd operator*(Simd o) const { return Simd(inner * o.inner); }
+    Simd operator/(Simd o) const { return Simd(inner / o.inner); }
+    SimdMask<T, W> operator>(Simd o) const { return SimdMask<T, W>(inner > o.inner); }
+};
+
+// Mixed scalar/Simd — stdx handles scalar*StdxT natively
+template<typename T, int W> inline Simd<T, W> operator+(T a, Simd<T, W> b) { return Simd<T, W>(a + b.inner); }
+template<typename T, int W> inline Simd<T, W> operator+(Simd<T, W> a, T b) { return Simd<T, W>(a.inner + b); }
+template<typename T, int W> inline Simd<T, W> operator-(T a, Simd<T, W> b) { return Simd<T, W>(a - b.inner); }
+template<typename T, int W> inline Simd<T, W> operator-(Simd<T, W> a, T b) { return Simd<T, W>(a.inner - b); }
+template<typename T, int W> inline Simd<T, W> operator*(T a, Simd<T, W> b) { return Simd<T, W>(a * b.inner); }
+template<typename T, int W> inline Simd<T, W> operator*(Simd<T, W> a, T b) { return Simd<T, W>(a.inner * b); }
+template<typename T, int W> inline Simd<T, W> operator/(T a, Simd<T, W> b) { return Simd<T, W>(a / b.inner); }
+template<typename T, int W> inline Simd<T, W> operator/(Simd<T, W> a, T b) { return Simd<T, W>(a.inner / b); }
+
+template<typename T, int W>
+inline Simd<T, W> min(Simd<T, W> a, Simd<T, W> b) { return Simd<T, W>(stdx::min(a.inner, b.inner)); }
+template<typename T, int W>
+inline Simd<T, W> max(Simd<T, W> a, Simd<T, W> b) { return Simd<T, W>(stdx::max(a.inner, b.inner)); }
+
+// TS v2 where(mask, v) is a masked assignment proxy, not a 3-arg select.
+// Wrap it into the select(mask, a, b) form our kernels expect. 
+template<typename T, int W>
+inline Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
+    auto result = b.inner;
+    stdx::where(mask.inner, result) = a.inner;
+    return Simd<T, W>(result);
+}
+
+// ===========================================================================
+// Implementation B: std::array backend (default)
+// ===========================================================================
+#else
+
+template<typename T, int W>
+struct SimdMask {
+    std::array<bool, W> data{};
     NANOVDB_SIMD_HOSTDEV bool  operator[](int i) const { return data[i]; }
     NANOVDB_SIMD_HOSTDEV bool& operator[](int i)       { return data[i]; }
 };

-// ---------------------------------------------------------------------------
-// Simd: W-wide vector of T, backed by std::array
-// ---------------------------------------------------------------------------
 template<typename T, int W>
 struct Simd {
     std::array<T, W> data{};

     Simd() = default;
-    NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); }   // broadcast
-    NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p) {
+    NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); }            // broadcast
+    NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p) {                      // load
         for (int i = 0; i < W; i++) data[i] = p[i];
     }
-    NANOVDB_SIMD_HOSTDEV T  operator[](int i) const { return data[i]; }
+    NANOVDB_SIMD_HOSTDEV T  operator[](int i) const { return data[i]; }
     NANOVDB_SIMD_HOSTDEV T& operator[](int i)       { return data[i]; }
-    NANOVDB_SIMD_HOSTDEV void store(T* p) const {
+    NANOVDB_SIMD_HOSTDEV void store(T* p) const {                         // store
         for (int i = 0; i < W; i++) p[i] = data[i];
     }
-
-    // Unary minus
     NANOVDB_SIMD_HOSTDEV Simd operator-() const {
-        Simd r;
-        for (int i = 0; i < W; i++) r.data[i] = -data[i];
-        return r;
+        Simd r; for (int i = 0; i < W; i++) r.data[i] = -data[i]; return r;
     }
-
-    // Lane-wise arithmetic (Simd op Simd)
     NANOVDB_SIMD_HOSTDEV Simd operator+(Simd o) const {
         Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] + o.data[i]; return r;
     }
@@ -76,8 +146,6 @@ struct Simd {
     NANOVDB_SIMD_HOSTDEV Simd operator/(Simd o) const {
         Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] / o.data[i]; return r;
     }
-
-    // Lane-wise comparison → SimdMask
     NANOVDB_SIMD_HOSTDEV SimdMask<T, W> operator>(Simd o) const {
         SimdMask<T, W> m;
         for (int i = 0; i < W; i++) m.data[i] = data[i] > o.data[i];
@@ -85,69 +153,44 @@ struct Simd {
     }
 };

-// ---------------------------------------------------------------------------
-// Mixed scalar/Simd arithmetic (enables e.g. 2.f * simd_val)
-// Template argument deduction does not use implicit conversions, so these
-// explicit overloads are required for scalar op Simd and Simd op scalar. 
-// ---------------------------------------------------------------------------
-template<typename T, int W> NANOVDB_SIMD_HOSTDEV
-Simd<T, W> operator+(Simd<T, W> a, T b) { return a + Simd<T, W>(b); }
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> operator+(T a, Simd<T, W> b) { return Simd<T, W>(a) + b; }
-
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
-Simd<T, W> operator-(Simd<T, W> a, T b) { return a - Simd<T, W>(b); }
+Simd<T, W> operator+(Simd<T, W> a, T b) { return a + Simd<T, W>(b); }
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> operator-(T a, Simd<T, W> b) { return Simd<T, W>(a) - b; }
-
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
-Simd<T, W> operator*(Simd<T, W> a, T b) { return a * Simd<T, W>(b); }
+Simd<T, W> operator-(Simd<T, W> a, T b) { return a - Simd<T, W>(b); }
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> operator*(T a, Simd<T, W> b) { return Simd<T, W>(a) * b; }
-
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
-Simd<T, W> operator/(Simd<T, W> a, T b) { return a / Simd<T, W>(b); }
+Simd<T, W> operator*(Simd<T, W> a, T b) { return a * Simd<T, W>(b); }
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> operator/(T a, Simd<T, W> b) { return Simd<T, W>(a) / b; }
-
-// ---------------------------------------------------------------------------
-// min/max: lane-wise minimum and maximum
-// Scalar overloads ensure templated kernels compile for T=float (GPU path)
-// ---------------------------------------------------------------------------
-template<typename T>
-NANOVDB_SIMD_HOSTDEV T min(T a, T b) { return a < b ? a : b; }
-
-template<typename T>
-NANOVDB_SIMD_HOSTDEV T max(T a, T b) { return a > b ? a : b; }
+template<typename T, int W> NANOVDB_SIMD_HOSTDEV
+Simd<T, W> operator/(Simd<T, W> a, T b) { return a / Simd<T, W>(b); }

 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> min(Simd<T, W> a, Simd<T, W> b) {
-    Simd<T, W> r;
-    for (int i = 0; i < W; i++) r[i] = a[i] < b[i] ? a[i] : b[i];
-    return r;
+    Simd<T, W> r; for (int i = 0; i < W; i++) r[i] = a[i] < b[i] ? a[i] : b[i]; return r;
 }
-
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> max(Simd<T, W> a, Simd<T, W> b) {
-    Simd<T, W> r;
-    for (int i = 0; i < W; i++) r[i] = a[i] > b[i] ? a[i] : b[i];
-    return r;
+    Simd<T, W> r; for (int i = 0; i < W; i++) r[i] = a[i] > b[i] ? a[i] : b[i]; return r;
 }
-
-// ---------------------------------------------------------------------------
-// where: lane-wise select — returns a where mask is true, b otherwise
-// Maps to VBLENDVPS on AVX2; no branching in vectorized code.
-// Scalar overload: plain ternary, for T=float (GPU path)
-// ---------------------------------------------------------------------------
-template<typename T>
-NANOVDB_SIMD_HOSTDEV T where(bool mask, T a, T b) { return mask ? a : b; }
-
 template<typename T, int W> NANOVDB_SIMD_HOSTDEV
 Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
-    Simd<T, W> r;
-    for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i];
-    return r;
+    Simd<T, W> r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r;
 }

+#endif // NANOVDB_USE_STD_SIMD
+
+// ---------------------------------------------------------------------------
+// Scalar overloads — always present, for T=float (GPU / scalar path)
+// ---------------------------------------------------------------------------
+template<typename T> NANOVDB_SIMD_HOSTDEV T min(T a, T b) { return a < b ? a : b; }
+template<typename T> NANOVDB_SIMD_HOSTDEV T max(T a, T b) { return a > b ? a : b; }
+template<typename T> NANOVDB_SIMD_HOSTDEV T where(bool m, T a, T b) { return m ? 
a : b; }

 } // namespace util
 } // namespace nanovdb

From 693fd2b321559beeb99631b578d1c1de0ce64c37 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Fri, 3 Apr 2026 14:42:56 -0500
Subject: [PATCH 10/60] simd_test: update investigation notes with dual-backend
 Simd.h findings

Document the std::experimental::simd backend alongside the std::array
default, including the TS v2 where() adaptation, the auto-detection
mechanism, and the assembly comparison showing byte-for-byte identical
output between the two backends under Clang 18. Update the vectorization
results table and open questions accordingly.

Signed-off-by: Efstathios Sifakis
Signed-off-by: Efty Sifakis
---
 simd_test/INVESTIGATION.md | 172 +++++++++++++++++++++----------------
 1 file changed, 100 insertions(+), 72 deletions(-)

diff --git a/simd_test/INVESTIGATION.md b/simd_test/INVESTIGATION.md
index 373180030f..d1c041c004 100644
--- a/simd_test/INVESTIGATION.md
+++ b/simd_test/INVESTIGATION.md
@@ -48,12 +48,8 @@ any attempted form (see §4).

 ### Why Superseded

-The input/output types are `std::tuple<std::array<T, W>&, ...>` — reference
-tuples pointing into existing SoA buffers. While correct, the design has two
-limitations:
-
-1. The scalar kernel is a separate code path from the GPU kernel — it takes a
-   tuple, not individual arguments, and cannot be templated on `T` directly.
+1. The scalar kernel takes a tuple, not individual arguments, and cannot be
+   templated on `T` directly — it is a separate code path from the GPU kernel.
 2. Vectorization relies entirely on the auto-vectorizer seeing through the
    tuple extraction loop, which GCC cannot do.

@@ -63,8 +59,7 @@ limitations:

 ### Core Idea

-Instead of lifting a scalar kernel into SIMD, write the kernel **once** as a
-template on its value type `T`:
+Write the kernel **once** as a template on its value type `T`:

 ```cpp
 template<typename T>
 T normSqGrad(T v0, T v1, ..., T v18, float dx2, float invDx2, float isoValue);
 ```

 - `T = float` → scalar path, `__hostdev__`-compatible, used on GPU per-thread
 - `T = Simd<float, W>` → W-wide SIMD path, used on CPU per-batch

 zero `#ifdef`.

 ### `where()` — the key primitive

-The `bool isOutside ? a : b` ternary cannot be used with a SIMD mask. `where(mask,
-a, b)` replaces it:
+`bool isOutside ? a : b` cannot be used with a SIMD mask. `where(mask, a, b)`
+replaces it:

 ```cpp
-// Scalar overload (T=float): plain ternary
+// Scalar (T=float): plain ternary — GPU path
 template<typename T>
 T where(bool mask, T a, T b) { return mask ? a : b; }

-// SIMD overload (T=Simd<float, W>): lane-wise blend → VBLENDVPS, no branch
+// SIMD (T=Simd<float, W>): lane-wise blend → VBLENDVPS, no branch
 template<typename T, int W>
 Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b);
 ```

 `v0 > T(isoValue)` deduces to `bool` when `T=float` and `SimdMask<float, W>` when
 `T=Simd<float, W>`, so the `where()` call resolves correctly in both cases.
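+
+A concrete resolution sketch (illustrative values; `W` as elsewhere in this doc):
+
+```cpp
+float          v0s = 1.f;          // one scalar lane
+Simd<float, W> v0v(1.f);           // one W-wide batch
+
+float          r1 = where(v0s > 0.f, 2.f, 3.f);     // bool mask → plain ternary
+Simd<float, W> r2 = where(v0v > Simd<float, W>(0.f),
+                          Simd<float, W>(2.f),
+                          Simd<float, W>(3.f));     // SimdMask → lane-wise blend
+```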
### Kernel structure

 ```cpp
 template<typename T, typename MaskT>
 T godunovsNormSqrd(MaskT isOutside,
                    T dP_xm, T dP_xp, T dP_ym, T dP_yp, T dP_zm, T dP_zp)
 {
     const T zero(0.f);
     T outside = max(max(dP_xm,zero)*max(dP_xm,zero), min(dP_xp,zero)*min(dP_xp,zero))
-              + ...;   // y, z
+              + ...;   // y, z axes
     T inside  = max(min(dP_xm,zero)*min(dP_xm,zero), max(dP_xp,zero)*max(dP_xp,zero))
-              + ...;   // y, z
+              + ...;
     return where(isOutside, outside, inside);
 }

 template<typename T>
 T normSqGrad(T v0, T v1, ..., T v18, float dx2, float invDx2, float isoValue)
 {
     const T dP_xm = weno5(...), dP_xp = weno5(...);
     const T dP_ym = weno5(...), dP_yp = weno5(...);
     const T dP_zm = weno5(...), dP_zp = weno5(...);
     return invDx2 * godunovsNormSqrd(v0 > T(isoValue),
                                      dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp);
 }
 ```

-This is structurally identical to `WenoStencil::normSqGrad` in `Stencils.h`.
+Structurally identical to `WenoStencil::normSqGrad` in `Stencils.h`.

 ### GPU / CPU call sites

 ```cpp
 // GPU: one thread per voxel, scalar instantiation
-float result = normSqGrad(v[0], v[1], ..., v[18], dx2, invDx2, iso);
+float result = normSqGrad(v[0], ..., v[18], dx2, invDx2, iso);

 // CPU: one call per batch of W voxels, SIMD instantiation
 using FloatSimd = nanovdb::util::Simd<float, W>;
-FloatSimd result = normSqGrad(sv[0], sv[1], ..., sv[18], dx2, invDx2, iso);
+FloatSimd result = normSqGrad(sv[0], ..., sv[18], dx2, invDx2, iso);
 ```

 NVCC's demand-driven template instantiation ensures `normSqGrad<Simd<float, W>>` is
-never compiled for device — it is only instantiated in host code.
+never compiled for device.
+
+---
+
+## 4. `nanovdb::util::Simd` — two backends
+
+`simd_test/Simd.h` (destined for `nanovdb/util/`) provides `Simd<T, W>`,
+`SimdMask<T, W>`, `min`, `max`, and `where` with two interchangeable implementations
+selected automatically at compile time.
+
+### Backend A: `std::experimental::simd` (C++26 / Parallelism TS v2)
+
+Activated when `<experimental/simd>` is available and
+`__cpp_lib_experimental_parallel_simd` is defined.
+
+`Simd<T, W>` and `SimdMask<T, W>` are thin wrappers around
+`std::experimental::fixed_size_simd` and
+`std::experimental::fixed_size_simd_mask`. All arithmetic delegates to the
+standard types; the compiler emits native vector instructions without relying on the
+auto-vectorizer.
+
+The TS v2 `where(mask, v)` is a 2-arg masked-assignment proxy, not a 3-arg select.
+The wrapper adapts it:
+```cpp
+template<typename T, int W>
+Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
+    auto result = b.inner;
+    stdx::where(mask.inner, result) = a.inner;
+    return Simd<T, W>(result);
+}
+```
+
+### Backend B: `std::array` (default, C++17)
+
+`Simd<T, W>` wraps `std::array<T, W>` with element-wise operator loops.
+Clang auto-vectorizes these loops; GCC does not (same class of struct-access
+limitation as Approach A). `__hostdev__`-annotated throughout for CUDA
+compatibility.
+
+### Assembly comparison (Clang 18, AVX2, `-O3 -march=native`)
+
+| Standard flag | Backend active | ymm count | Assembly |
+|---|---|---|---|
+| `-std=c++17` | `std::array` | 1275 | — |
+| `-std=c++26` | `std::experimental::simd` | 1275 | **byte-for-byte identical** |
+
+Clang fully inlines through the `stdx` wrapper — zero overhead. Both paths produce
+the same 1275 ymm instructions, all 16 lanes pass.
+
+### C++26 migration path
+
+When `std::simd` lands in `<simd>` (not yet in Clang 18's libstdc++), replacing the
+`std::experimental` wrapper with a `std::simd` alias will be a one-line typedef
+change. The kernel source is unchanged.

 ---

-## 4. Vectorization Experiments and Findings (Approach A)
+## 5. Vectorization Experiments and Findings (Approach A)

 Platform: x86-64, AVX2, Ubuntu. GCC 13. Clang 18.
 Base flags: `-O3 -march=native -std=c++17`

@@ -183,63 +215,59 @@ in any attempted form. Clang 18 vectorizes the unmodified original.

 ---

-## 5. 
Vectorization Results (Approach B, assembly-verified)
-
-Compiled with:
-```sh
-clang++-18 -O3 -march=native -std=c++17 \
-    -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \
-    -o lift_test lift_test.cpp
-```
+## 6. Vectorization Results (Approach B, assembly-verified)

-**Clang**: 964 total ymm instructions; 691 inside `runSimdNormSqGrad` +
-`normSqGrad<Simd<float, 16>>`. Key instructions: `vfmadd*ps`, `vsubps`, `vmulps`,
-`vmaxps`, `vminps`, `vblendvps`, `vcmpnltps`. Two separate instantiations confirmed
-in the symbol table: `normSqGrad<Simd<float, 16>>` and `normSqGrad<float>`.
+| Compiler / flags | Backend | ymm in hot fn | Result |
+|---|---|---|---|
+| clang++-18 `-std=c++17` | `std::array` | 691 / 1275 total | PASS |
+| clang++-18 `-std=c++26` | `std::experimental::simd` | 691 / 1275 total | PASS |
+| g++ `-std=c++17` | `std::array` | 0 (not vectorized) | PASS (scalar) |

-**GCC**: Correct results but does not vectorize the per-operator loops inside
-`Simd<T, W>` ("more than one data ref in stmt", "return slot optimization" on
-`weno5` calls). Same class of struct-access limitation as Approach A.
+Key instructions in vectorized path: `vfmadd*ps`, `vsubps`, `vmulps`, `vmaxps`,
+`vminps`, `vblendvps`, `vcmpnltps`. Two separate instantiations in the symbol
+table: `normSqGrad<Simd<float, 16>>` (SIMD) and `normSqGrad<float>` (scalar ref).

-All 16 lanes produce correct results vs. the scalar (`T=float`) reference on both
-compilers.

 ---

-## 6. Open Questions / Next Steps
+## 7. Open Questions / Next Steps

-- **GCC support**: The per-operator loops in `Simd<T, W>` (e.g., `operator+`) are
-  simple W-iteration loops over `std::array` members. GCC's "return slot
-  optimization" diagnostic on `weno5` calls suggests it cannot treat the `Simd`
-  return values as local registers. Explicit intrinsics (AVX2 `__m256`) in
-  `Simd` would guarantee GCC vectorization but require architecture-specific
-  specializations.
+- **GCC support**: The per-operator loops in `Simd<T, W>` (e.g., `operator+`) are
+  simple W-iteration loops over `std::array` members. GCC's "return slot
+  optimization" diagnostic on `weno5` calls suggests it cannot treat the `Simd`
+  return values as local registers. Explicit intrinsics (AVX2 `__m256`) in
+  `Simd` would guarantee GCC vectorization but require architecture-specific
+  specializations.
-- **Benchmarking**: Throughput of the vectorized Clang path vs. scalar not yet
-  measured on representative VBM data.
+- **Benchmarking**: Throughput of the vectorized Clang path vs. scalar not yet
+  measured on representative VBM data.
-- **Integration**: `Simd.h` to be moved to `nanovdb/util/Simd.h`; `weno5`,
-  `godunovsNormSqrd`, `normSqGrad` to be templated in `nanovdb/math/Stencils.h`.
+- **Integration**: `Simd.h` to be moved to `nanovdb/util/Simd.h`; `weno5`,
+  `godunovsNormSqrd`, `normSqGrad` to be templated in `nanovdb/math/Stencils.h`.
-- **C++26 migration**: Once `std::simd` is available, `nanovdb::util::Simd`
-  can be replaced with `std::fixed_size_simd` — the kernel source is unchanged.
+- **`<simd>` header**: Clang 18 provides `<experimental/simd>` but not `<simd>`.
+  Once `<simd>` is available, the detection guard can be simplified to
+  `#if __cpp_lib_simd`.

 ---

-## 7. File Reference
+## 8. File Reference

 | File | Purpose |
 |------|---------|
-| `simd_test/Simd.h` | Minimal `nanovdb::util::Simd` library (prototype; destined for `nanovdb/util/`) |
+| `simd_test/Simd.h` | `nanovdb::util::Simd` — two backends, auto-detected (prototype for `nanovdb/util/`) |
-| `simd_test/lift_test.cpp` | Test: templated `weno5`, `godunovsNormSqrd`, `normSqGrad`; correctness check vs. 
scalar reference | +| `simd_test/Simd.h` | `nanovdb::util::Simd` — two backends, auto-detected (prototype for `nanovdb/util/`) | +| `simd_test/lift_test.cpp` | Test: templated `weno5`, `godunovsNormSqrd`, `normSqGrad`; correctness vs. scalar reference | | `nanovdb/nanovdb/math/Stencils.h` | Original `weno5`, `GodunovsNormSqrd`, `WenoStencil::normSqGrad` | | `nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md` | Per-block stencil gather design doc | | `nanovdb/nanovdb/tools/VoxelBlockManager.h` | CPU VBM implementation | Build commands: ```sh -# Clang (vectorizes): +# Clang, std::array backend (C++17): clang++-18 -O3 -march=native -std=c++17 \ -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \ -o lift_test lift_test.cpp +# Clang, std::experimental::simd backend (C++26) — identical assembly: +clang++-18 -O3 -march=native -std=c++26 \ + -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \ + -o lift_test lift_test.cpp + # GCC (correct results, does not vectorize): g++ -O3 -march=native -std=c++17 -fopt-info-vec-missed -o lift_test lift_test.cpp ``` From c4d163153eacbe64ddfc7c5771ddfd0e1ad6bbfd Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Fri, 3 Apr 2026 18:19:11 -0500 Subject: [PATCH 11/60] simd_test: Generic-T kernel hierarchy + Simd.h refinements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StencilKernel.h — new prototype header: - BaseStencilKernel: owns mValues[], mDx2, mInvDx2; no grid coupling - WenoStencilKernel: derives from above, provides normSqGrad() - WENO5 and GodunovsNormSqrd: free functions mirroring Stencils.h - T=float for GPU scalar path, T=Simd for CPU batch path lift_test.cpp — rewritten to use WenoStencilKernel directly: - SIMD and scalar reference paths both instantiate the same class - dx passed to constructor; mValues populated via operator[] Simd.h — refinements: - Simd and SimdMask in Backend A are now pure type aliases for stdx::fixed_size_simd / fixed_size_simd_mask (no wrapper struct) - element_aligned_tag / element_aligned: portable load/store tag, always present; aliases stdx::element_aligned_tag in Backend A, dummy struct in B - Backend B load constructor and store() accept element_aligned_tag (defaulted) - NANOVDB_NO_STD_SIMD opt-out flag to force Backend B INVESTIGATION.md — updated: - Approach B section updated to reflect class hierarchy instead of free functions - Backend B GCC note: the struct-access failure was specific to Approach A's liftToSimd outer-lane loop; Backend B's fixed-count operator loops do vectorize on GCC when used with the Generic-T class hierarchy - New ymm tables for both backends under GCC (Backend A: 1267 total, Backend B: 619 total); both pass correctness Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- simd_test/INVESTIGATION.md | 190 ++++++++++++++++++++----------------- simd_test/Simd.h | 77 ++++++--------- simd_test/StencilKernel.h | 165 ++++++++++++++++++++++++++++++++ simd_test/lift_test.cpp | 123 +++--------------------- 4 files changed, 312 insertions(+), 243 deletions(-) create mode 100644 simd_test/StencilKernel.h diff --git a/simd_test/INVESTIGATION.md b/simd_test/INVESTIGATION.md index d1c041c004..9be7f4bd22 100644 --- a/simd_test/INVESTIGATION.md +++ b/simd_test/INVESTIGATION.md @@ -44,7 +44,7 @@ auto liftToSimd(ScalarFn f) { Clang 18 vectorizes the unmodified kernel (with `std::max` and `bool isOutside`) producing a full ymm path with a runtime alias check. 
GCC 13 does not vectorize in
-any attempted form (see §4).
+any attempted form (see §5).

 ### Why Superseded

 ### Core Idea

 Write the kernel **once** as a template on its value type `T`:

-```cpp
-template<typename T>
-T normSqGrad(T v0, T v1, ..., T v18, float dx2, float invDx2, float isoValue);
-```
-
 - `T = float` → scalar path, `__hostdev__`-compatible, used on GPU per-thread
 - `T = Simd<float, W>` → W-wide SIMD path, used on CPU per-batch

 `v0 > T(isoValue)` deduces to `bool` when `T=float` and `SimdMask<float, W>` when
 `T=Simd<float, W>`, so the `where()` call resolves correctly in both cases.

-### Kernel structure
+### Class hierarchy

-```cpp
-template<typename T, typename MaskT>
-T godunovsNormSqrd(MaskT isOutside,
-                   T dP_xm, T dP_xp, T dP_ym, T dP_yp, T dP_zm, T dP_zp)
-{
-    const T zero(0.f);
-    T outside = max(max(dP_xm,zero)*max(dP_xm,zero), min(dP_xp,zero)*min(dP_xp,zero))
-              + ...;   // y, z axes
-    T inside  = max(min(dP_xm,zero)*min(dP_xm,zero), max(dP_xp,zero)*max(dP_xp,zero))
-              + ...;
-    return where(isOutside, outside, inside);
-}
+`WENO5` and `GodunovsNormSqrd` are free functions in `StencilKernel.h`,
+mirroring their counterparts in `Stencils.h`. The stencil data and compute methods
+live in a two-level class hierarchy:

-template<typename T>
-T normSqGrad(T v0, T v1, ..., T v18, float dx2, float invDx2, float isoValue)
-{
-    const T dP_xm = weno5(...), dP_xp = weno5(...);
-    const T dP_ym = weno5(...), dP_yp = weno5(...);
-    const T dP_zm = weno5(...), dP_zp = weno5(...);
-    return invDx2 * godunovsNormSqrd(v0 > T(isoValue),
-                                     dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp);
-}
+```
+BaseStencilKernel<T, SIZE>   mValues[SIZE], mDx2, mInvDx2  — pure data
+        |
+WenoStencilKernel<T>         normSqGrad(), ...             — pure compute
 ```

-Structurally identical to `WenoStencil::normSqGrad` in `Stencils.h`.
+No grid coupling, no accessor, no `moveTo()`. The VBM gather populates `mValues`
+directly; `normSqGrad()` is then called on the populated kernel object.

 ### GPU / CPU call sites

 ```cpp
-// GPU: one thread per voxel, scalar instantiation
-float result = normSqGrad(v[0], ..., v[18], dx2, invDx2, iso);
-
-// CPU: one call per batch of W voxels, SIMD instantiation
-using FloatSimd = nanovdb::util::Simd<float, W>;
-FloatSimd result = normSqGrad(sv[0], ..., sv[18], dx2, invDx2, iso);
+// GPU: one thread, scalar — fill from per-thread stencil gather
+WenoStencilKernel<float> sk(dx);
+for (int n = 0; n < 19; n++) sk[n] = gathered_scalar_values[n];
+float result = sk.normSqGrad(isoValue);
+
+// CPU: one batch, SIMD — fill from VBM batch gather
+WenoStencilKernel<Simd<float, W>> sk(dx);
+for (int n = 0; n < 19; n++) sk[n] = gathered_simd_values[n];
+Simd<float, W> result = sk.normSqGrad(isoValue);
 ```

+### Relationship to legacy WenoStencil
+
+The existing `BaseStencil` / `WenoStencil` hierarchy in
+`Stencils.h` couples data storage to a grid accessor and a `moveTo()` cursor — a
+sequential, single-threaded API incompatible with VBM batch processing. The kernel
+hierarchy is designed as its eventual replacement. During transition, the legacy
+classes can simply derive from the kernel classes to inherit the compute methods
+without disruption.
+
-NVCC's demand-driven template instantiation ensures `normSqGrad<Simd<float, W>>` is
-never compiled for device.
+NVCC's demand-driven template instantiation ensures `WenoStencilKernel<Simd<float, W>>`
+is never compiled for device.

 ---

 ## 4. `nanovdb::util::Simd` — two backends

`simd_test/Simd.h` (destined for `nanovdb/util/`) provides `Simd<T, W>`,
 `SimdMask<T, W>`, `min`, `max`, and `where` with two interchangeable implementations
-selected automatically at compile time.
+selected automatically at compile time. Suppress Backend A with
+`-DNANOVDB_NO_STD_SIMD` to force the fallback.

 ### Backend A: `std::experimental::simd` (C++26 / Parallelism TS v2)

-Activated when `<experimental/simd>` is available and
-`__cpp_lib_experimental_parallel_simd` is defined.
+Activated when `<experimental/simd>` is available,
+`__cpp_lib_experimental_parallel_simd` is defined, and `NANOVDB_NO_STD_SIMD` is not
+set.

-`Simd<T, W>` and `SimdMask<T, W>` are thin wrappers around
+`Simd<T, W>` and `SimdMask<T, W>` are **pure type aliases** for
 `std::experimental::fixed_size_simd` and
 `std::experimental::fixed_size_simd_mask`. All arithmetic delegates to the
 standard types; the compiler emits native vector instructions without relying on the
 auto-vectorizer.

 The TS v2 `where(mask, v)` is a 2-arg masked-assignment proxy, not a 3-arg select.
-The wrapper adapts it:
+A thin free function adapts it:
 ```cpp
 template<typename T, int W>
 Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
-    auto result = b.inner;
-    stdx::where(mask.inner, result) = a.inner;
-    return Simd<T, W>(result);
+    auto result = b;
+    stdx::where(mask, result) = a;
+    return result;
 }
 ```

 ### Backend B: `std::array` (default, C++17)

 `Simd<T, W>` wraps `std::array<T, W>` with element-wise operator loops.
-Clang auto-vectorizes these loops; GCC does not (same class of struct-access
-limitation as Approach A). `__hostdev__`-annotated throughout for CUDA
-compatibility.
+`__hostdev__`-annotated throughout for CUDA compatibility.
+
+**GCC vectorization note**: GCC's failure to auto-vectorize in §5 was specific to
+Approach A's outer-lane loop pattern, where GCC could not see through `std::tuple`
+struct indirection in GIMPLE. Backend B's element-wise operator loops (e.g.
+`for (int i = 0; i < W; i++) r[i] = a[i] + b[i]`) are a completely different target
+— fixed-count, no struct indirection — and GCC does auto-vectorize them when used
+with the Generic-T kernel class hierarchy (see §6).

+### element_aligned_tag — portable load/store descriptor
+
+`nanovdb::util::element_aligned_tag` and `nanovdb::util::element_aligned` are always
+present. In Backend A they alias `stdx::element_aligned_tag` (same type the stdx
+constructors expect); in Backend B they are a standalone dummy struct (ignored).
+This makes the load constructor `Simd(const T*, element_aligned)` portable across
+both backends and forward-compatible with `std::simd`.
+
 ### C++26 migration path

-When `std::simd` lands in `<simd>` (not yet in Clang 18's libstdc++), replacing the
-`std::experimental` wrapper with a `std::simd` alias will be a one-line typedef
-change. The kernel source is unchanged.
+When `std::simd` lands in `<simd>`, migration is a one-line change: replace the
+`stdx` detection block with `#if __cpp_lib_simd` and `std::experimental` with `std`.
+The kernel source, `element_aligned_tag`, and all call sites are unchanged.
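+
+A sketch of that one-line change (hypothetical until `<simd>` ships; assumes the
+C++26 `std::simd<T, N>` alias form):
+
+```cpp
+#if defined(__cpp_lib_simd)                  // future C++26 <simd>
+template<typename T, int W> using Simd = std::simd<T, W>;
+#elif defined(__cpp_lib_experimental_parallel_simd)
+template<typename T, int W> using Simd = stdx::fixed_size_simd<T, W>;
+#endif
+```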
--- @@ -211,38 +210,55 @@ Base flags: `-O3 -march=native -std=c++17` | 10 | Flat `float[N][W]` arrays | No (gather stride) | n/a | **Conclusion for Approach A**: GCC 13 cannot auto-vectorize the `liftToSimd` pattern -in any attempted form. Clang 18 vectorizes the unmodified original. +in any attempted form. The root cause is GCC's inability to see through `std::tuple`'s +recursive-inheritance struct layout in GIMPLE — not a limitation of Backend B per se. --- ## 6. Vectorization Results (Approach B, assembly-verified) -| Compiler / flags | Backend | ymm in hot fn | Result | -|---|---|---|---| -| clang++-18 `-std=c++17` | `std::array` | 691 / 1275 total | PASS | -| clang++-18 `-std=c++26` | `std::experimental::simd` | 691 / 1275 total | PASS | -| g++ `-std=c++17` | `std::array` | 0 (not vectorized) | PASS (scalar) | +GCC 13, AVX2, `-O3 -march=native -std=c++17`. ymm counts per function (assembly-inspected). -Key instructions in vectorized path: `vfmadd*ps`, `vsubps`, `vmulps`, `vmaxps`, -`vminps`, `vblendvps`, `vcmpnltps`. Two separate instantiations in the symbol -table: `normSqGrad>` (SIMD) and `normSqGrad` (scalar ref). +### Backend A (`std::experimental::simd`, auto-detected) + +| Function | ymm instructions | +|---|---| +| `WenoStencilKernel::normSqGrad` | 945 (WENO5 inlined ×6) | +| `GodunovsNormSqrd` | 289 (out-of-line) | +| `min` / `max` | 10 each | +| `runSimdNormSqGrad` (test wrapper) | 0 (call shell only) | +| **Total** | **1267** | + +### Backend B (`std::array`, forced with `-DNANOVDB_NO_STD_SIMD`) + +| Function | ymm instructions | +|---|---| +| `WenoStencilKernel::normSqGrad` | 365 | +| `WENO5` | 137 (out-of-line) | +| `GodunovsNormSqrd` | 117 (out-of-line) | +| **Total** | **619** | + +Both backends pass all 16 lanes. Backend B vectorizes via GCC's auto-vectorizer on +the fixed-count element-wise operator loops — the struct-access limitation from +Approach A does not apply here. + +Key instructions in both paths: `vfmadd*ps`, `vsubps`, `vmulps`, `vmaxps`, +`vminps`, `vblendvps`, `vcmpnltps`. --- ## 7. Open Questions / Next Steps -- **GCC support**: The per-operator loops in `Simd` (Backend B) are simple - W-iteration loops over `std::array`. GCC's "return slot optimization" on `weno5` - calls prevents vectorization. Explicit AVX2 intrinsics in a GCC-specific - `Simd` specialization would guarantee it, at the cost of - architecture-specific code. - **Benchmarking**: Throughput of the vectorized path vs. scalar not yet measured on representative VBM data. -- **Integration**: Move `Simd.h` to `nanovdb/util/Simd.h`; template `weno5`, - `godunovsNormSqrd`, `normSqGrad` in `nanovdb/math/Stencils.h`. +- **Integration**: Move `Simd.h` to `nanovdb/util/Simd.h`; move `StencilKernel.h` + to `nanovdb/math/`; have legacy `WenoStencil` derive from `WenoStencilKernel` + during transition, then retire it. - **`` header**: Clang 18 provides `` but not ``. - Once `` is available, the detection guard can be simplified to - `#if __cpp_lib_simd`. + Once `` is available, the detection guard simplifies to `#if __cpp_lib_simd`. +- **Clang assembly verification**: Clang not yet installed on this machine. Previous + results (691 ymm flat in hot function, free-function version) predate the + class-based refactor; re-verification pending. --- @@ -251,23 +267,27 @@ table: `normSqGrad>` (SIMD) and `normSqGrad` (scalar ref). 
| File | Purpose |
 |------|---------|
 | `simd_test/Simd.h` | `nanovdb::util::Simd` — two backends, auto-detected (prototype for `nanovdb/util/`) |
+| `simd_test/StencilKernel.h` | `BaseStencilKernel`, `WenoStencilKernel`, `WENO5`, `GodunovsNormSqrd` (prototype for `nanovdb/math/`) |
-| `simd_test/lift_test.cpp` | Test: templated `weno5`, `godunovsNormSqrd`, `normSqGrad`; correctness vs. scalar reference |
+| `simd_test/lift_test.cpp` | Correctness test: SIMD vs scalar reference via `WenoStencilKernel` |
-| `nanovdb/nanovdb/math/Stencils.h` | Original `weno5`, `GodunovsNormSqrd`, `WenoStencil::normSqGrad` |
+| `nanovdb/nanovdb/math/Stencils.h` | Original scalar `WENO5`, `GodunovsNormSqrd`, `WenoStencil::normSqGrad` |
 | `nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md` | Per-block stencil gather design doc |
 | `nanovdb/nanovdb/tools/VoxelBlockManager.h` | CPU VBM implementation |

 Build commands:
 ```sh
-# Clang, std::array backend (C++17):
-clang++-18 -O3 -march=native -std=c++17 \
-    -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \
-    -o lift_test lift_test.cpp
+# GCC, Backend A (std::experimental::simd, auto-detected):
+g++ -O3 -march=native -std=c++17 -o lift_test lift_test.cpp
+
+# GCC, Backend B (std::array, forced):
+g++ -O3 -march=native -std=c++17 -DNANOVDB_NO_STD_SIMD -o lift_test lift_test.cpp

-# Clang, std::experimental::simd backend (C++26) — identical assembly:
+# Clang, Backend A (std::experimental::simd, C++26):
 clang++-18 -O3 -march=native -std=c++26 \
     -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \
     -o lift_test lift_test.cpp

-# GCC (correct results, does not vectorize):
-g++ -O3 -march=native -std=c++17 -fopt-info-vec-missed -o lift_test lift_test.cpp
+# Clang, Backend B (std::array, C++17 or forced):
+clang++-18 -O3 -march=native -std=c++17 \
+    -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \
+    -o lift_test lift_test.cpp
 ```
diff --git a/simd_test/Simd.h b/simd_test/Simd.h
index 8c56770fd6..9470ab9b7c 100644
--- a/simd_test/Simd.h
+++ b/simd_test/Simd.h
@@ -6,7 +6,7 @@
 // Two implementations, selected automatically at compile time:
 //
 //   NANOVDB_USE_STD_SIMD (set when <experimental/simd> is available):
-//     Simd<T, W> and SimdMask<T, W> are thin wrappers around
+//     Simd<T, W> and SimdMask<T, W> are pure type aliases for
 //     std::experimental::fixed_size_simd / fixed_size_simd_mask.
 //     All arithmetic delegates to the standard type; the compiler emits
 //     native vector instructions without relying on the auto-vectorizer.
@@ -32,7 +32,7 @@
 // ---------------------------------------------------------------------------
 // Auto-detect std::experimental::simd (Parallelism TS v2)
 // ---------------------------------------------------------------------------
-#if defined(__has_include) && __has_include(<experimental/simd>)
+#if !defined(NANOVDB_NO_STD_SIMD) && defined(__has_include) && __has_include(<experimental/simd>)
 #  include <experimental/simd>
 #  ifdef __cpp_lib_experimental_parallel_simd
 #    define NANOVDB_USE_STD_SIMD 1
@@ -42,67 +42,44 @@
 namespace nanovdb {
 namespace util {

+// ---------------------------------------------------------------------------
+// element_aligned_tag — portable load/store alignment descriptor.
+// In the stdx backend this is an alias for stdx::element_aligned_tag so that
+// nanovdb::util::element_aligned is the same token stdx constructors expect.
+// In the std::array backend it is a standalone dummy struct (ignored). 
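+// Usage sketch (illustrative): the same load spelling compiles against either
+// backend, e.g.
+//     Simd<float, 8> v(ptr, element_aligned);   // ptr assumed: const float*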
+// ---------------------------------------------------------------------------
+#ifdef NANOVDB_USE_STD_SIMD
+namespace stdx = std::experimental;
+using element_aligned_tag = stdx::element_aligned_tag;
+#else
+struct element_aligned_tag {};
+#endif
+inline constexpr element_aligned_tag element_aligned{};
+
 // ===========================================================================
-// Implementation A: std::experimental::simd wrapper
+// Implementation A: std::experimental::simd — pure type aliases
 // ===========================================================================
 #ifdef NANOVDB_USE_STD_SIMD

-namespace stdx = std::experimental;
-
 template<typename T, int W>
-struct SimdMask {
-    stdx::fixed_size_simd_mask<T, W> inner{};
-
-    SimdMask() = default;
-    SimdMask(stdx::fixed_size_simd_mask<T, W> m) : inner(m) {}
-    bool operator[](int i) const { return inner[i]; }
-};
+using SimdMask = stdx::fixed_size_simd_mask<T, W>;

 template<typename T, int W>
-struct Simd {
-    using StdxT = stdx::fixed_size_simd<T, W>;
-    StdxT inner{};
-
-    Simd() = default;
-    Simd(T scalar) : inner(scalar) {}        // broadcast
-    explicit Simd(const T* p)                // load
-        : inner(p, stdx::element_aligned) {}
-    Simd(StdxT v) : inner(v) {}              // from stdx ops
-
-    T operator[](int i) const { return inner[i]; }
-    void store(T* p) const { inner.copy_to(p, stdx::element_aligned); }
-
-    Simd operator-() const { return Simd(-inner); }
-    Simd operator+(Simd o) const { return Simd(inner + o.inner); }
-    Simd operator-(Simd o) const { return Simd(inner - o.inner); }
-    Simd operator*(Simd o) const { return Simd(inner * o.inner); }
-    Simd operator/(Simd o) const { return Simd(inner / o.inner); }
-    SimdMask<T, W> operator>(Simd o) const { return SimdMask<T, W>(inner > o.inner); }
-};
-
-// Mixed scalar/Simd — stdx handles scalar*StdxT natively
-template<typename T, int W> inline Simd<T, W> operator+(T a, Simd<T, W> b) { return Simd<T, W>(a + b.inner); }
-template<typename T, int W> inline Simd<T, W> operator+(Simd<T, W> a, T b) { return Simd<T, W>(a.inner + b); }
-template<typename T, int W> inline Simd<T, W> operator-(T a, Simd<T, W> b) { return Simd<T, W>(a - b.inner); }
-template<typename T, int W> inline Simd<T, W> operator-(Simd<T, W> a, T b) { return Simd<T, W>(a.inner - b); }
-template<typename T, int W> inline Simd<T, W> operator*(T a, Simd<T, W> b) { return Simd<T, W>(a * b.inner); }
-template<typename T, int W> inline Simd<T, W> operator*(Simd<T, W> a, T b) { return Simd<T, W>(a.inner * b); }
-template<typename T, int W> inline Simd<T, W> operator/(T a, Simd<T, W> b) { return Simd<T, W>(a / b.inner); }
-template<typename T, int W> inline Simd<T, W> operator/(Simd<T, W> a, T b) { return Simd<T, W>(a.inner / b); }
+using Simd = stdx::fixed_size_simd<T, W>;

 template<typename T, int W>
-inline Simd<T, W> min(Simd<T, W> a, Simd<T, W> b) { return Simd<T, W>(stdx::min(a.inner, b.inner)); }
+inline Simd<T, W> min(Simd<T, W> a, Simd<T, W> b) { return stdx::min(a, b); }
 template<typename T, int W>
-inline Simd<T, W> max(Simd<T, W> a, Simd<T, W> b) { return Simd<T, W>(stdx::max(a.inner, b.inner)); }
+inline Simd<T, W> max(Simd<T, W> a, Simd<T, W> b) { return stdx::max(a, b); }

 // TS v2 where(mask, v) is a masked assignment proxy, not a 3-arg select.
 // Wrap it into the select(mask, a, b) form our kernels expect. 
template inline Simd where(SimdMask mask, Simd a, Simd b) { - auto result = b.inner; - stdx::where(mask.inner, result) = a.inner; - return Simd(result); + auto result = b; + stdx::where(mask, result) = a; + return result; } // =========================================================================== @@ -122,13 +99,13 @@ struct Simd { std::array data{}; Simd() = default; - NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); } // broadcast - NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p) { // load + NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); } // broadcast + NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p, element_aligned_tag = {}) { // load for (int i = 0; i < W; i++) data[i] = p[i]; } NANOVDB_SIMD_HOSTDEV T operator[](int i) const { return data[i]; } NANOVDB_SIMD_HOSTDEV T& operator[](int i) { return data[i]; } - NANOVDB_SIMD_HOSTDEV void store(T* p) const { + NANOVDB_SIMD_HOSTDEV void store(T* p, element_aligned_tag = {}) const { // store for (int i = 0; i < W; i++) p[i] = data[i]; } NANOVDB_SIMD_HOSTDEV Simd operator-() const { diff --git a/simd_test/StencilKernel.h b/simd_test/StencilKernel.h new file mode 100644 index 0000000000..f67f7f940a --- /dev/null +++ b/simd_test/StencilKernel.h @@ -0,0 +1,165 @@ +#pragma once +#include "Simd.h" + +// Portable __hostdev__ annotation — no-op outside CUDA, matching NanoVDB convention. +#ifndef __CUDACC__ +# ifndef __hostdev__ +# define __hostdev__ +# endif +#endif + +// --------------------------------------------------------------------------- +// Prototype of the kernel-only stencil hierarchy for NanoVDB. +// +// Defines BaseStencilKernel and WenoStencilKernel, where T is: +// float — scalar, __hostdev__-compatible, GPU per-thread path +// Simd — W-wide SIMD, CPU per-batch path +// +// These are pure data + compute classes with no grid coupling. They are +// intended to replace the compute portions of BaseStencil / WenoStencil in +// nanovdb/math/Stencils.h, with the legacy accessor-based classes deriving +// from these to retain backward compatibility during transition. +// +// Free functions WENO5 and GodunovsNormSqrd mirror their counterparts in +// Stencils.h, templatized on T so they work for both scalar and SIMD. +// --------------------------------------------------------------------------- + +namespace nanovdb { +namespace math { + +using namespace nanovdb::util; // min, max, where, Simd, SimdMask + +// --------------------------------------------------------------------------- +// WENO5 — fifth-order upwind interpolation, templated on T. +// Mirrors WENO5 in Stencils.h; here RealT == T throughout. 
+// ---------------------------------------------------------------------------
+template <typename T>
+__hostdev__ inline T WENO5(T v1, T v2, T v3, T v4, T v5, float scale2 = 1.f)
+{
+    const float C = 13.f / 12.f;
+    const T eps = T(1.0e-6f * scale2);
+
+    const T d12 = v1 - 2.f*v2 + v3;
+    const T d13 = v1 - 4.f*v2 + 3.f*v3;
+    const T d23 = v2 - 2.f*v3 + v4;
+    const T d24 = v2 - v4;
+    const T d34 = v3 - 2.f*v4 + v5;
+    const T d35 = 3.f*v3 - 4.f*v4 + v5;
+
+    const T w1 = C*d12*d12 + 0.25f*d13*d13 + eps;
+    const T w2 = C*d23*d23 + 0.25f*d24*d24 + eps;
+    const T w3 = C*d34*d34 + 0.25f*d35*d35 + eps;
+
+    const T A1 = 0.1f / (w1*w1);
+    const T A2 = 0.6f / (w2*w2);
+    const T A3 = 0.3f / (w3*w3);
+
+    return (A1*(2.f*v1 - 7.f*v2 + 11.f*v3) +
+            A2*(5.f*v3 - v2 + 2.f*v4) +
+            A3*(2.f*v3 + 5.f*v4 - v5)) / (6.f*(A1+A2+A3));
+}
+
+// ---------------------------------------------------------------------------
+// GodunovsNormSqrd — templated on T (value type) and MaskT (mask type).
+// Mirrors GodunovsNormSqrd in Stencils.h.
+// The if/else branch in the original is replaced by unconditionally computing
+// both the outside and inside terms and blending via where(), so the SIMD
+// path produces a lane-wise select with no control flow divergence.
+// ---------------------------------------------------------------------------
+template <typename T, typename MaskT>
+__hostdev__ inline T GodunovsNormSqrd(MaskT isOutside,
+                                      T dP_xm, T dP_xp,
+                                      T dP_ym, T dP_yp,
+                                      T dP_zm, T dP_zp)
+{
+    const T zero(0.f);
+    T outside = max(max(dP_xm, zero) * max(dP_xm, zero),
+                    min(dP_xp, zero) * min(dP_xp, zero))  // (dP/dx)^2
+              + max(max(dP_ym, zero) * max(dP_ym, zero),
+                    min(dP_yp, zero) * min(dP_yp, zero))  // (dP/dy)^2
+              + max(max(dP_zm, zero) * max(dP_zm, zero),
+                    min(dP_zp, zero) * min(dP_zp, zero)); // (dP/dz)^2
+
+    T inside  = max(min(dP_xm, zero) * min(dP_xm, zero),
+                    max(dP_xp, zero) * max(dP_xp, zero))  // (dP/dx)^2
+              + max(min(dP_ym, zero) * min(dP_ym, zero),
+                    max(dP_yp, zero) * max(dP_yp, zero))  // (dP/dy)^2
+              + max(min(dP_zm, zero) * min(dP_zm, zero),
+                    max(dP_zp, zero) * max(dP_zp, zero)); // (dP/dz)^2
+
+    return where(isOutside, outside, inside);
+}
+
+// ---------------------------------------------------------------------------
+// BaseStencilKernel
+//
+// Owns mValues[SIZE] and the grid spacing parameters mDx2 / mInvDx2.
+// No grid accessor, no moveTo — pure data container for stencil compute.
+// ---------------------------------------------------------------------------
+template <typename T, int SIZE>
+class BaseStencilKernel
+{
+protected:
+    T mValues[SIZE]{};
+    float mDx2{1.f}, mInvDx2{1.f};
+
+public:
+    __hostdev__ BaseStencilKernel() = default;
+    __hostdev__ explicit BaseStencilKernel(float dx)
+        : mDx2(dx * dx), mInvDx2(1.f / (dx * dx)) {}
+
+    __hostdev__ T& operator[](int n) { return mValues[n]; }
+    __hostdev__ const T& operator[](int n) const { return mValues[n]; }
+
+    __hostdev__ static constexpr int size() { return SIZE; }
+};
+
+// ---------------------------------------------------------------------------
+// WenoStencilKernel
+//
+// Derives from BaseStencilKernel<T, 19> and provides normSqGrad() and
+// related compute methods. Mirrors the compute interface of WenoStencil in
+// nanovdb/math/Stencils.h. 
+//
+// mValues layout (matching WenoPt::idx):
+//   [0]        = center ( 0, 0, 0)
+//   [1]..[6]   = x-axis (-3,-2,-1, +1,+2,+3)
+//   [7]..[12]  = y-axis (-3,-2,-1, +1,+2,+3)
+//   [13]..[18] = z-axis (-3,-2,-1, +1,+2,+3)
+// ---------------------------------------------------------------------------
+template <typename T>
+class WenoStencilKernel : public BaseStencilKernel<T, 19>
+{
+    using Base = BaseStencilKernel<T, 19>;
+
+protected:
+    using Base::mValues;
+    using Base::mDx2;
+    using Base::mInvDx2;
+
+public:
+    using Base::Base;
+
+    /// @brief Return the norm-squared of the WENO upwind gradient at the
+    /// buffered stencil location, using Godunov's scheme.
+    /// Matches WenoStencil::normSqGrad() in Stencils.h.
+    __hostdev__ inline T normSqGrad(float isoValue = 0.f) const
+    {
+        const T* v = mValues;
+        const T
+            dP_xm = WENO5(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2),
+            dP_xp = WENO5(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2),
+            dP_ym = WENO5(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2),
+            dP_yp = WENO5(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2),
+            dP_zm = WENO5(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2),
+            dP_zp = WENO5(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2);
+
+        return T(mInvDx2) * GodunovsNormSqrd(v[0] > T(isoValue),
+                                             dP_xm, dP_xp,
+                                             dP_ym, dP_yp,
+                                             dP_zm, dP_zp);
+    }
+};
+
+} // namespace math
+} // namespace nanovdb
diff --git a/simd_test/lift_test.cpp b/simd_test/lift_test.cpp
index a697cd3d2e..297e733063 100644
--- a/simd_test/lift_test.cpp
+++ b/simd_test/lift_test.cpp
@@ -1,97 +1,9 @@
-#include "Simd.h"
+#include "StencilKernel.h"
 #include <cstdio>
 #include <cmath>
 
 using namespace nanovdb::util;
-
-// ---------------------------------------------------------------------------
-// WENO5 upwind interpolation — templated on T
-//   T = float : scalar, usable as __hostdev__ on GPU
-//   T = Simd<float, W> : W-wide vectorized version on CPU
-// ---------------------------------------------------------------------------
-template <typename T>
-T weno5(T v1, T v2, T v3, T v4, T v5, float dx2 = 1.f)
-{
-    const float C = 13.f / 12.f;
-    const T eps = T(1.0e-6f * dx2);
-
-    const T d12 = v1 - 2.f*v2 + v3;
-    const T d13 = v1 - 4.f*v2 + 3.f*v3;
-    const T d23 = v2 - 2.f*v3 + v4;
-    const T d24 = v2 - v4;
-    const T d34 = v3 - 2.f*v4 + v5;
-    const T d35 = 3.f*v3 - 4.f*v4 + v5;
-
-    const T w1 = C*d12*d12 + 0.25f*d13*d13 + eps;
-    const T w2 = C*d23*d23 + 0.25f*d24*d24 + eps;
-    const T w3 = C*d34*d34 + 0.25f*d35*d35 + eps;
-
-    const T A1 = 0.1f / (w1*w1);
-    const T A2 = 0.6f / (w2*w2);
-    const T A3 = 0.3f / (w3*w3);
-
-    return (A1*(2.f*v1 - 7.f*v2 + 11.f*v3) +
-            A2*(5.f*v3 - v2 + 2.f*v4) +
-            A3*(2.f*v3 + 5.f*v4 - v5)) / (6.f*(A1+A2+A3));
-}
-
-// ---------------------------------------------------------------------------
-// godunovsNormSqrd — templated on T (value type) and MaskT (comparison result)
-//   MaskT = bool when T = float (GPU / scalar path)
-//   MaskT = SimdMask<float, W> when T = Simd<float, W> (CPU SIMD path)
-// Mirrors GodunovsNormSqrd in nanovdb/math/Stencils.h
-// ---------------------------------------------------------------------------
-template <typename T, typename MaskT>
-T godunovsNormSqrd(MaskT isOutside,
-                   T dP_xm, T dP_xp,
-                   T dP_ym, T dP_yp,
-                   T dP_zm, T dP_zp)
-{
-    const T zero(0.f);
-    T outside = max(max(dP_xm, zero) * max(dP_xm, zero),
-                    min(dP_xp, zero) * min(dP_xp, zero))  // (dP/dx)^2
-              + max(max(dP_ym, zero) * max(dP_ym, zero),
-                    min(dP_yp, zero) * min(dP_yp, zero))  // (dP/dy)^2
-              + max(max(dP_zm, zero) * max(dP_zm, zero),
-                    
min(dP_zp, zero) * min(dP_zp, zero)); // (dP/dz)^2 - - T inside = max(min(dP_xm, zero) * min(dP_xm, zero), - max(dP_xp, zero) * max(dP_xp, zero)) // (dP/dx)^2 - + max(min(dP_ym, zero) * min(dP_ym, zero), - max(dP_yp, zero) * max(dP_yp, zero)) // (dP/dy)^2 - + max(min(dP_zm, zero) * min(dP_zm, zero), - max(dP_zp, zero) * max(dP_zp, zero)); // (dP/dz)^2 - - return where(isOutside, outside, inside); -} - -// --------------------------------------------------------------------------- -// normSqGrad — templated on T -// Input layout matches WenoStencil / WenoPt::idx (19 values): -// v0 = center ( 0, 0, 0) -// v1..v6 = x-axis (-3,-2,-1, +1,+2,+3) -// v7..v12 = y-axis (-3,-2,-1, +1,+2,+3) -// v13..v18 = z-axis (-3,-2,-1, +1,+2,+3) -// --------------------------------------------------------------------------- -template -T normSqGrad(T v0, - T v1, T v2, T v3, T v4, T v5, T v6, - T v7, T v8, T v9, T v10, T v11, T v12, - T v13, T v14, T v15, T v16, T v17, T v18, - float dx2, float invDx2, float isoValue = 0.f) -{ - const T dP_xm = weno5(v2 -v1, v3 -v2, v0 -v3, v4 -v0, v5 -v4, dx2); - const T dP_xp = weno5(v6 -v5, v5 -v4, v4 -v0, v0 -v3, v3 -v2, dx2); - const T dP_ym = weno5(v8 -v7, v9 -v8, v0 -v9, v10-v0, v11-v10, dx2); - const T dP_yp = weno5(v12-v11, v11-v10, v10-v0, v0 -v9, v9 -v8, dx2); - const T dP_zm = weno5(v14-v13, v15-v14, v0 -v15, v16-v0, v17-v16, dx2); - const T dP_zp = weno5(v18-v17, v17-v16, v16-v0, v0 -v15, v15-v14, dx2); - - return invDx2 * godunovsNormSqrd(v0 > T(isoValue), - dP_xm, dP_xp, - dP_ym, dP_yp, - dP_zm, dP_zp); -} +using namespace nanovdb::math; // --------------------------------------------------------------------------- // SIMD wrapper — noinline to prevent constant-folding in the test @@ -100,32 +12,27 @@ constexpr int W = 16; using FloatSimd = Simd; __attribute__((noinline)) -FloatSimd runSimdNormSqGrad(const FloatSimd sv[19], - float dx2, float invDx2, float isoValue) +FloatSimd runSimdNormSqGrad(const FloatSimd sv[19], float dx, float isoValue) { - return normSqGrad( - sv[0], sv[1], sv[2], sv[3], sv[4], sv[5], sv[6], - sv[7], sv[8], sv[9], sv[10], sv[11], sv[12], - sv[13], sv[14], sv[15], sv[16], sv[17], sv[18], - dx2, invDx2, isoValue); + WenoStencilKernel sk(dx); + for (int n = 0; n < 19; n++) sk[n] = sv[n]; + return sk.normSqGrad(isoValue); } // --------------------------------------------------------------------------- -// Reference: scalar path — same kernel instantiated with T=float +// Reference: scalar path — same kernel class instantiated with T=float // --------------------------------------------------------------------------- -float refNormSqGrad(const float v[19], float dx2, float invDx2, float isoValue = 0.f) +float refNormSqGrad(const float v[19], float dx, float isoValue = 0.f) { - return normSqGrad( - v[0], v[1], v[2], v[3], v[4], v[5], v[6], - v[7], v[8], v[9], v[10], v[11], v[12], - v[13], v[14], v[15], v[16], v[17], v[18], - dx2, invDx2, isoValue); + WenoStencilKernel sk(dx); + for (int n = 0; n < 19; n++) sk[n] = v[n]; + return sk.normSqGrad(isoValue); } // --------------------------------------------------------------------------- int main() { - const float dx = 0.1f, dx2 = dx*dx, invDx2 = 1.f/(dx*dx); + const float dx = 0.1f; // Storage: SoA layout — inData[n] holds W lane values for stencil position n float inData[19][W]{}; @@ -140,14 +47,14 @@ int main() // Load into Simd — each FloatSimd holds one stencil position across all W lanes FloatSimd sv[19]; for (int n = 0; n < 19; n++) - sv[n] = FloatSimd(inData[n]); + sv[n] = FloatSimd(inData[n], 
element_aligned); - FloatSimd result = runSimdNormSqGrad(sv, dx2, invDx2, 0.f); + FloatSimd result = runSimdNormSqGrad(sv, dx, 0.f); printf("WenoNormSqGrad full 3-axis (W=%d, dx=%.2f):\n", W, dx); bool allOk = true; for (int i = 0; i < W; i++) { - float ref = refNormSqGrad(refValues[i], dx2, invDx2, 0.f); + float ref = refNormSqGrad(refValues[i], dx, 0.f); float got = result[i]; bool ok = std::abs(got - ref) < 1e-5f * std::abs(ref) + 1e-10f; printf(" lane %2d: %12.6f ref: %12.6f %s\n", i, got, ref, ok ? "OK" : "FAIL"); From 42e54cc7123b6d7dba97901fde57f0934f39291a Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sat, 11 Apr 2026 03:17:19 -0500 Subject: [PATCH 12/60] StencilGather.md: document CPU batch leaf-ptr design (probedMask, batchPtrs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add full CPU batch neighbor-leaf resolution design to the planning doc: - §6: Replace StencilLeafPtrs struct with layered design — shared 3×3×3 bit encoding for probedMask/ptrs[27], stencil-specific batchPtrs population (batchPtrs[4][SIMDw] for WENO5, batchPtrs[3][3][3][SIMDw] for box stencil), and GPU scalar design note kept separate. - §8d: Update lazy-probe section to reference ptrs[27] and 27-bit probedMask; add batchPtrs population step (Phase 2) after the probeLeaf loop. - §8e: Update computeNeededDirs direction table to use 3×3×3 bit positions (bits 4,10,12,14,16,22 for WENO5 face neighbors). - §8f/§8g: Minor notation updates to match ptrs[27] naming. - §9: Resolve ptrs-layout and nExtraLeaves open questions; add prototype scope. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../StencilGather.md | 380 ++++++++++++++++-- 1 file changed, 349 insertions(+), 31 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md index 467900b86c..9ffca20e10 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md @@ -215,28 +215,84 @@ computeStencil(leaf, voxelOffset, leafPtrs, data[N]) --- -## 6. StencilLeafPtrs Struct +## 6. Neighbor Direction Encoding and Leaf Pointer Tables -Unified template parameterized on build type and leaf pointer type, enabling both -scalar (GPU) and batched (CPU) instantiations from one definition: +### 6a. Shared 3×3×3 Bit Encoding + +All stencil types use the same flat bit encoding for neighbor directions, based on +the 3×3×3 cube of immediately adjacent leaves: + +``` +bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1) +``` + +where `(dx, dy, dz) ∈ {-1, 0, +1}³`. This yields 27 bits total, fitting in a +`uint32_t`. Bit 13 is the center `(0,0,0)` — always implicit, never probed. + +``` +neighborCoord(centerCoord, bit): + dx = bit/9 - 1, dy = (bit/3)%3 - 1, dz = bit%3 - 1 + return centerCoord + Coord(dx*8, dy*8, dz*8) // leaf origin offset +``` + +The six WENO5 face-neighbor bits are a strict subset of the 27: + +| Direction | (dx,dy,dz) | bit | +|-----------|-----------|-----| +| x-lo | (-1, 0, 0) | **4** | +| y-lo | ( 0,-1, 0) | **10** | +| z-lo | ( 0, 0,-1) | **12** | +| z-hi | ( 0, 0,+1) | **14** | +| y-hi | ( 0,+1, 0) | **16** | +| x-hi | (+1, 0, 0) | **22** | + +For the box stencil, all 26 non-center bits may be set. The encoding is identical; +only the set of active bits differs. + +### 6b. 
Common Per-Leaf Canonical State (CPU)
+
+State that persists across batches within one center leaf:
 
 ```cpp
-template <typename BuildT, typename LeafPtrT>
-struct StencilLeafPtrs {
-    LeafPtrT ptrs[3][3]; // [axis][slot]: slot 0=lo, 1=center, 2=hi
-};
+uint32_t probedMask = 0; // bit d set ↔ direction d has been probed this leaf
+const LeafT* ptrs[27] = {}; // canonical neighbor table; ptrs[13] unused (center)
+Coord centerLeafCoord;
 ```
 
-- **GPU** (scalar per thread): `LeafPtrT = const NanoLeaf<BuildT>*`
-- **CPU batch** (SIMDw lanes): `LeafPtrT = std::array<const NanoLeaf<BuildT>*, SIMDw>`
+`ptrs[]` is populated lazily by the `probeLeaf` loop (§8d). For WENO5, only the
+six face-direction entries (bits 4,10,12,14,16,22) are ever non-null; the 21
+edge/corner entries remain null throughout.
+
+### 6c. Stencil-Specific Per-Batch SIMD Table
+
+After the probeLeaf loop fills `ptrs[27]`, the relevant entries are broadcast into a
+per-lane SIMD table whose layout is stencil-specific:
 
-The `ptrs[3][3]` shape is correct for stencils with R ≤ 3. Larger stencils would
-require a different slot count.
+**WENO5 — `batchPtrs[4][SIMDw]`** (center + one per axis):
+- `[0][i]` — center leaf (uniform broadcast of `&currentLeaf`)
+- `[1][i]` — x-axis neighbor: `ptrs[4]` if `lx < R`, `ptrs[22]` if `lx >= 8-R`, else `nullptr`
+- `[2][i]` — y-axis neighbor: `ptrs[10]` / `ptrs[16]` / `nullptr`
+- `[3][i]` — z-axis neighbor: `ptrs[12]` / `ptrs[14]` / `nullptr`
 
-The current GPU draft in `VoxelBlockManager.cuh` uses an unparameterized
-`WenoLeafPtrs<BuildT>` (GPU-only, WENO5-specific). Generalizing to
-`StencilLeafPtrs<BuildT, LeafPtrT>` is a prerequisite for the CPU implementation
-and for supporting additional stencil shapes.
+The broadcast is masked: a scalar `ptrs[bit]` value is written into lane `i` under
+the condition that lane `i`'s local coordinate requires that direction. The
+lo/hi decision is encoded in the ptr value itself — `computeStencil` does not need
+to distinguish lo from hi at index-computation time.
+
+**Box stencil — `batchPtrs[3][3][3][SIMDw]`**: the full 27-entry cube, per lane.
+Population follows the same masked-broadcast pattern, driven by each lane's
+`(lx, ly, lz)` relative to leaf boundaries.
+
+This compaction is the step that bridges the shared scalar probeLeaf machinery (§8d)
+and the SIMD stencil index computation (§8g).
+
+### 6d. GPU Scalar Design (Unchanged)
+
+The GPU per-thread design uses `ptrs[3][3]` (axis × {lo, center, hi}) and probes
+all needed directions unconditionally on entry — acceptable because each GPU thread
+handles one voxel and the probe count is bounded by 3. The GPU design does not
+use `probedMask` or the 27-bit encoding. Both CPU and GPU designs resolve neighbor
+leaves via `probeLeaf`; the machinery diverges only in batch vs. scalar granularity.
 
 ---
 
@@ -266,19 +322,265 @@ per-thread and divergence-safe.
 Process voxels in batches of `SIMDw = 16`. With AVX2 (16 × uint16_t per register),
 each batch maps to one SIMD register width for `voxelOffset`.
 
-### 8b. probeLeaf Deduplication
+### 8b. Scan-Order Coherence and Expected probeLeaf Count
+
+NanoVDB linearizes active voxels **z-fast, y-medium, x-slow** (offset = x×64 + y×8 + z).
+This means consecutive sequential active voxels vary z fastest and x slowest. The
+expected intra-leaf distribution across a batch of SIMDw=16 at ~50% leaf density
+(~256 active voxels per leaf):
+
+- 16 active voxels span ~32 scan positions → one fixed intra-leaf **x** value,
+  ~4 consecutive **y** values, and all 8 **z** values covered. 
+ +This axis asymmetry determines the expected number of **unique** probeLeaf calls +per batch after deduplication: + +| Axis | Reason | Expected unique probes | +|------|--------|----------------------| +| x | All 16 voxels at same intra-leaf x; need lo or hi but not both | **≈ 0.75** | +| y | Spans ~4 y values; may straddle lo/hi boundary | **≈ 1.2** | +| z | All 8 z values present; always needs both z-lo and z-hi | **≈ 2** (deterministic) | + +**Total expected unique probeLeaf calls per batch: ~4** (well below the theoretical +maximum of 6). + +For stencils with R ≤ 3 (WENO5), a voxel at intra-leaf position p needs the +lo neighbor when p < R and the hi neighbor when p > (LeafDim - 1 - R). +For R=3, LeafDim=8: lo needed for p ∈ {0,1,2}, hi for p ∈ {5,6,7}. + +At lower leaf densities the batch spans more leaves and the expected count rises +toward 6; at higher densities it falls toward 2 (x and y each converge to 1, z stays 2). + +### 8c. ReadAccessor: Cache Behavior for probeLeaf + +The NanoVDB `DefaultReadAccessor` (`ReadAccessor`) stores +three independent single-slot caches: one per tree level (leaf/lower/upper). The +`get` dispatch checks **only the cache at `OpT::LEVEL`**, as an `if constexpr` +chain: + +```cpp +if constexpr(OpT::LEVEL <= 0) { + if (isCached(ijk)) return leaf->getAndCache(...); // leaf hit +} else if constexpr(OpT::LEVEL <= 1) { ... } // compiled away for GetLeaf + else if constexpr(OpT::LEVEL <= 2) { ... } // compiled away for GetLeaf +return mRoot->getAndCache(ijk, *this); // leaf miss → full traversal +``` + +For `GetLeaf` (LEVEL=0), the compiled code is exactly two paths: + +- **Leaf cache hit**: `isCached` check (3 masked comparisons) + return + `mNode[0]`. No memory loads beyond the accessor struct. Cost: ~6 integer + instructions, essentially free. + +- **Leaf cache miss**: falls directly to `mRoot->getAndCache` — a **full + root-to-leaf traversal**, identical in cost to `tree.probeLeaf(ijk)`. The lower + and upper node caches (`mNode[1]`, `mNode[2]`) are **not consulted** for LEVEL=0 + operations; they are populated as a side effect of the traversal but never read + back for subsequent `get` calls. + +This is a deliberate NanoVDB design choice (simpler code, better GPU SIMT behavior). +It differs from OpenVDB's `ValueAccessor3`, which does check lower/upper caches on +a leaf miss and can short-circuit traversal from a cached lower node. + +**Implications for probeLeaf in the stencil gather:** + +The ReadAccessor only helps `probeLeaf` when consecutive calls land in the **same +leaf**. For calls targeting different leaves — even adjacent leaves in the same +lower node — it is a full root traversal each time. + +**Accessor granularity:** Use **one `DefaultReadAccessor` per CPU thread**, +constructed once before the block loop and reused across all blocks and all axes. +Per-axis accessors would each pay a cold traversal for their first probe in a batch, +losing the cross-axis leaf-cache sharing (in the typical single-leaf batch, one probe +warms the leaf and all subsequent probes across any axis that happen to need the same +leaf get the hit for free). Per-block construction discards carryover between +consecutive blocks, which is wasteful since consecutive blocks process spatially +adjacent leaves. + +### 8d. Neighbor Leaf Resolution — Lazy Probe with Per-Leaf Cache + +**Why not unconditional probing:** an alternative design probes all `NUM_DIRS` +neighbor directions when the center leaf changes, caching the full pointer table +upfront. 
For WENO5 (6 face-neighbor directions) this is only marginally wasteful.
+For the box stencil (26 directions: 6 faces + 12 edges + 8 corners), most batches
+are interior and never touch edge or corner leaves; unconditionally probing all 26
+would waste ~15–20 probeLeaf calls per center leaf.
+
+**Why not naive per-voxel accessor use:** calling `acc.probeLeaf` for every lane
+without deduplication causes leaf-cache thrashing at every y-row boundary (the cache
+alternates between z-lo and z-hi at each transition). For 4 y-rows per batch, the
+z-direction alone produces ~8 full traversals instead of 2. Not recommended.
 
-Within a batch of SIMDw=16 voxels, the neighbor coordinate along each axis (rounded
-to leaf granularity) takes at most **2 distinct values** per axis. The result of each
-`probeLeaf` call is broadcast to the lanes sharing that neighbor coordinate.
+**Design: lazy probe with per-leaf `probedMask`.**
 
-For a stencil with R ≤ 3: ≤ 2 `probeLeaf` calls per axis × 3 axes =
-**≤ 6 `probeLeaf` calls per batch** (vs up to 3×SIMDw for naive per-voxel approach).
+State that persists across all batches within the same center leaf (see §6b):
 
-The deduplication bound depends on both SIMDw and leaf size (8). For larger SIMDw
-or larger R, more distinct neighbor coordinates can appear per batch.
+```cpp
+uint32_t probedMask = 0; // 27-bit; bit = (dx+1)*9 + (dy+1)*3 + (dz+1)
+const LeafT* ptrs[27] = {}; // canonical neighbor table (§6a); center implicit
+Coord centerLeafCoord;
+```
+
+Per-batch logic — Phase 1 (probeLeaf):
+
+```cpp
+uint32_t neededMask = computeNeededDirs(voxelOffset_batch, laneMask); // §8e
+uint32_t toProbe = neededMask & ~probedMask; // needed AND not yet cached
+
+while (toProbe) {
+    int d = __builtin_ctz(toProbe); // position of lowest set bit
+    ptrs[d] = acc.get(neighborCoord(centerLeafCoord, d));
+    probedMask |= (1u << d);
+    toProbe &= toProbe - 1; // clear lowest set bit
+}
+```
+
+Per-batch — Phase 2 (populate stencil-specific `batchPtrs` from `ptrs[27]`):
+
+```cpp
+// WENO5 example:
+const LeafT* batchPtrs[4][SIMDw];
+for (int i = 0; i < SIMDw; i++) batchPtrs[0][i] = &currentLeaf;
+for (int i = 0; i < SIMDw; i++) {
+    int lx = voxelOffset[b+i] >> 6;
+    batchPtrs[1][i] = (lx < R) ? ptrs[4] : (lx >= 8-R) ? ptrs[22] : nullptr;
+    int ly = (voxelOffset[b+i] >> 3) & 7;
+    batchPtrs[2][i] = (ly < R) ? ptrs[10] : (ly >= 8-R) ? ptrs[16] : nullptr;
+    int lz = voxelOffset[b+i] & 7;
+    batchPtrs[3][i] = (lz < R) ? ptrs[12] : (lz >= 8-R) ? ptrs[14] : nullptr;
+}
+```
+
+Then `computeStencil(batchPtrs, voxelOffset + b, data + b)` (§8g).
+
+On center leaf advance (`currentLeafID++`):
+
+```cpp
+probedMask = 0;
+centerLeafCoord = tree.getFirstNode<0>()[currentLeafID].origin();
+// stale ptrs[] entries are harmless; probedMask=0 guarantees re-probe before use
+```
+
+`probedMask` persists across batch boundaries. A direction probed during batch k
+is not re-probed during batch k+1 if the center leaf has not changed. Total
+probeLeaf calls per center leaf = number of distinct directions needed across all
+batches in that leaf, always ≤ 26 (≤ 6 for WENO5).
+
+**Where the ReadAccessor genuinely earns its keep:** the `getValue` calls inside
+`computeStencil` that fetch N stencil values per voxel. Many of these land in the
+same leaf repeatedly. One accessor per thread, reused across the entire block loop,
+accumulates leaf-cache hits throughout the computation.
+
+### 8e. 
`computeNeededDirs` — Stencil-Specific Batch Probe Mask + +```cpp +uint32_t computeNeededDirs(const uint16_t* voxelOffset, uint32_t laneMask); +``` -### 8c. computeStencil Vectorization +Inspects the `voxelOffset` values of active lanes and returns a bitmask of directions +whose neighbor leaf is accessed by at least one lane. This is purely arithmetic on +voxelOffsets — no tree access, no probeLeaf. + +Direction bits use the shared 3×3×3 encoding from §6a: +`bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)`. + +**Direction encoding (WENO5, 6 active bits out of 27):** + +| Bit | Direction | (dx,dy,dz) | Condition (per lane) | +|-----|-----------|-----------|-----------------------------------| +| 4 | x-lo | (-1,0,0) | `(vo >> 6) < R` | +| 10 | y-lo | (0,-1,0) | `((vo >> 3) & 0x7) < R` | +| 12 | z-lo | (0,0,-1) | `(vo & 0x7) < R` | +| 14 | z-hi | (0,0,+1) | `(vo & 0x7) >= (8 - R)` | +| 16 | y-hi | (0,+1,0) | `((vo >> 3) & 0x7) >= (8 - R)` | +| 22 | x-hi | (+1,0,0) | `(vo >> 6) >= (8 - R)` | + +For WENO5 (R=3): z-lo when lz ∈ {0,1,2}, z-hi when lz ∈ {5,6,7}. Each condition +is a threshold comparison across SIMDw lanes, folded with an `any()` reduction. +The remaining 21 bits of `neededMask` are always zero for WENO5. + +**Box stencil (R=1, up to 26 active bits):** face directions use the same six bit +positions with thresholds 0 and 7. Edge directions (e.g. (-1,-1,0) → bit 1) +require a pairwise AND: `any(lx == 0 && ly == 0)`. Corner directions require all +three simultaneously. Same mechanism throughout — all pure arithmetic on +voxelOffsets, same `uint32_t` mask type. + +`computeNeededDirs` is the only function that encodes knowledge of the stencil's +reach R and how offsets map to neighbor leaves. It is written once per stencil +shape and is small (≤ 20 SIMD instructions for WENO5). + +### 8f. CPU Block-Level Loop Structure + +**Block dispatch using `nExtraLeaves`.** + +`nExtraLeaves` is the popcount of the entire block's `jumpMap` — already computed +inside `decodeInverseMaps` as the loop bound for the leaf-iteration pass: + +```cpp +int nExtraLeaves = 0; +for (int i = 0; i < JumpMapLength; i++) + nExtraLeaves += util::countOn(jumpMap[i]); +``` + +`nExtraLeaves + 1` equals the total number of center leaves touched within this +block. This value is a natural block-level dispatch condition: + +- `nExtraLeaves == 0`: entire block is single-leaf. No `currentLeafID` advances, + no straddle batches. Can specialize the inner loop to eliminate dead branches. +- `nExtraLeaves >= 1`: at least one leaf transition. At most `nExtraLeaves` straddle + batches exist; all other batches are single-leaf. + +**Loop skeleton (general path):** + +```cpp +uint32_t currentLeafID = firstLeafID; +uint32_t probedMask = 0; +const LeafT* ptrs[NUM_DIRS] = {}; +Coord centerLeafCoord = tree.getFirstNode<0>()[currentLeafID].origin(); + +for (int b = 0; b < BlockWidth; b += SIMDw) { + uint32_t activeMask = non_sentinel_mask(leafIndex + b); + if (!activeMask) continue; + + while (activeMask) { + // Which lanes belong to the current center leaf? 
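+        // (lanes_equal is assumed to be a SIMDw-wide compare of leafIndex entries
+        // against currentLeafID returning a lane bitmask — one vector compare plus
+        // a move-mask on AVX2.)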
+ uint32_t leafMask = lanes_equal(leafIndex + b, currentLeafID) & activeMask; + + if (!leafMask) { + // No lanes match: advance to the next leaf + currentLeafID++; + probedMask = 0; + centerLeafCoord = tree.getFirstNode<0>()[currentLeafID].origin(); + continue; + } + + // Probe any newly needed neighbors + uint32_t neededMask = computeNeededDirs(voxelOffset + b, leafMask); + uint32_t toProbe = neededMask & ~probedMask; + while (toProbe) { + int d = __builtin_ctz(toProbe); + ptrs[d] = acc.get(neighborCoord(centerLeafCoord, d)); + probedMask |= (1u << d); + toProbe &= toProbe - 1; + } + + computeStencil(leafMask, ptrs, voxelOffset + b, data + b); + activeMask &= ~leafMask; + } +} +``` + +**Key invariants:** +- `currentLeafID` is monotonically non-decreasing across the entire block; it + advances at most `nExtraLeaves` times. +- `probedMask` is reset only when `currentLeafID` changes — not on every batch. + Directions probed in earlier batches stay cached. +- For single-leaf blocks, the `if (!leafMask)` branch is dead, `currentLeafID` + never changes, and `probedMask` accumulates across all batches in the block. +- For straddle batches, the `while (activeMask)` iterates twice (once per leaf + present in the batch), each time consuming its subset of lanes. + +### 8g. computeStencil Vectorization The outer loop over lanes (i = 0 .. SIMDw-1) calls `computeStencil` once per lane with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strategy: @@ -307,10 +609,26 @@ with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strate lambda's return type via `decltype` / CTAD, or supplied as an explicit template parameter alongside the stencil type. -- **CPU `resolveLeafPtrs` batch function**: the per-batch deduplication logic (§8b) - needs its own function, separate from the GPU scalar `resolveLeafPtrs`. Signature - and deduplication algorithm TBD. - -- **Generalizing beyond R ≤ 3**: the `ptrs[3][3]` struct and single-neighbor-per-axis - assumption are baked into the current design. Any stencil with R > 4 would require - revisiting §5a and §6. +- **`ptrs[]` layout — GPU vs CPU divergence**: the GPU design keeps `ptrs[3][3]` + (axis × {lo,center,hi}), probing unconditionally per thread. The CPU design uses + the canonical `ptrs[27]` + `probedMask` (§6b) as common infrastructure, then + populates a stencil-specific `batchPtrs` (§6c). These two designs are intentionally + separate; no unification is needed. + +- **`nExtraLeaves` surfacing**: recomputed cheaply from the block's jumpMap after + `decodeInverseMaps` returns (popcount loop, same as the internal loop bound). + `decodeInverseMaps` API is not modified — avoids CPU/GPU asymmetry. + +- **Prototype — immediate next step**: `stencil_gather_cpu.cpp` in + `ex_voxelBlockManager_host_cuda/`. Scope: + - Generate domain with `generateDomain` (reuse from `vbm_host_cuda.cpp`). + - Build VBM. Iterate over blocks; call `decodeInverseMaps` per block. + - For each batch: run the full §8d probeLeaf + `batchPtrs[4][SIMDw]` population. + - Verification only (no `computeStencil`): for each active lane, walk all 19 + WENO5 offsets, check that `batchPtrs[axis][i]` matches a direct `probeLeaf` + reference for every neighbor that crosses a leaf boundary. + - Use WENO5 stencil directly (not the simpler 7-pt Laplacian — WENO5 exercises + R=3 boundary conditions and all six face directions). + +- **Generalizing beyond R ≤ 3**: the single-neighbor-per-axis assumption is baked + into the current design. 
Any stencil with R > 4 would require revisiting §5a and §6. From 536467ef5825040aa4d21d001ca2941e1b11f623 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sat, 11 Apr 2026 03:38:44 -0500 Subject: [PATCH 13/60] ex_stencil_gather_cpu: prototype for CPU SIMD stencil gather Phase 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone CPU-only executable that verifies the neighbor leaf resolution design from StencilGather.md §8d–§8f: - For each VBM block: calls decodeInverseMaps, recomputes nLeaves from jumpMap, then processes SIMDw=16 batches with the full probedMask / lazy-probeLeaf / batchPtrs[4][SIMDw] pipeline. - Does not call computeStencil. Instead verifies batchPtrs against a direct probeLeaf reference for all 18 non-center WENO5 stencil offsets that cross leaf boundaries. - Passes at 0.1, 0.25, 0.5, 0.9 occupancy (2.3M–2.9M lane checks each). Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/examples/CMakeLists.txt | 7 + .../stencil_gather_cpu.cpp | 431 ++++++++++++++++++ 2 files changed, 438 insertions(+) create mode 100644 nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt index 898a412987..33d1cab330 100644 --- a/nanovdb/nanovdb/examples/CMakeLists.txt +++ b/nanovdb/nanovdb/examples/CMakeLists.txt @@ -121,6 +121,13 @@ if(TARGET ex_voxelBlockManager_host_cuda) $<$:-mavx2 -fopenmp-simd>) endif() +# CPU-only SIMD stencil gather prototype (Phase 1: neighbor leaf resolution). +# No CUDA required. Design in ex_voxelBlockManager_host_cuda/StencilGather.md. +nanovdb_example(NAME "ex_stencil_gather_cpu") +if(TARGET ex_stencil_gather_cpu) + target_compile_options(ex_stencil_gather_cpu PRIVATE -mavx2 -fopenmp-simd) +endif() + if(CUDAToolkit_FOUND) nanovdb_example(NAME "ex_make_mgpu_nanovdb") # requires cuRAND target_link_libraries(ex_make_mgpu_nanovdb PRIVATE CUDA::curand) diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp new file mode 100644 index 0000000000..0a898009b5 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -0,0 +1,431 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file stencil_gather_cpu.cpp + + \brief Prototype for CPU SIMD stencil gather — Phase 1 only: + neighbor leaf resolution with lazy probeLeaf and per-leaf probedMask cache. + + Design documented in: + nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md + + What this prototype does (and does NOT do): + - Generates a random Morton-layout domain (same as vbm_host_cuda). + - Builds a ValueOnIndex NanoVDB grid and a VoxelBlockManager. + - For every block: calls decodeInverseMaps, then processes SIMD batches of + SIMDw=16 lanes, running the full probedMask / probeLeaf / batchPtrs population + pipeline described in StencilGather.md §8d–§8f. + - Does NOT call computeStencil. Instead, verifies that batchPtrs[4][SIMDw] + is correct for every active lane: for each of the 18 non-center WENO5 + stencil offsets, if the offset crosses a leaf boundary the corresponding + batchPtrs[axis+1][lane] is checked against a direct probeLeaf reference. + + Build: + Configured via CMakeLists.txt in the parent examples/ directory. + No CUDA required; CPU-only. 
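+
+    Example run (hypothetical invocation — 1048576 ambient voxels at 0.5
+    occupancy are also the built-in defaults):
+      ./ex_stencil_gather_cpu 1048576 0.5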
+ + Usage: stencil_gather_cpu [ambient_voxels [occupancy]] +*/ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// ============================================================ +// Constants and type aliases +// ============================================================ + +static constexpr int Log2BlockWidth = 7; +static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128 +static constexpr int SIMDw = 16; // batch width +static constexpr int R = 3; // WENO5 stencil reach (±3) + +using BuildT = nanovdb::ValueOnIndex; +using GridT = nanovdb::NanoGrid; +using LeafT = nanovdb::NanoLeaf; +using CPUVBM = nanovdb::tools::VoxelBlockManager; +using AccT = nanovdb::DefaultReadAccessor; + +// Direction bit encoding shared across all stencil types: +// bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz ∈ {-1,0,+1} +// +// WENO5 face-neighbor bits (the only 6 bits ever set for WENO5): +static constexpr int kLoBit[3] = {4, 10, 12}; // x-lo, y-lo, z-lo +static constexpr int kHiBit[3] = {22, 16, 14}; // x-hi, y-hi, z-hi + +// ============================================================ +// Test domain generation (mirrors vbm_host_cuda.cpp) +// ============================================================ + +static uint32_t coordinate_bitpack(uint32_t x) +{ + x &= 0x49249249; + x |= (x >> 2); x &= 0xc30c30c3; + x |= (x >> 4); x &= 0x0f00f00f; + x |= (x >> 8); x &= 0xff0000ff; + x |= (x >> 16); x &= 0x0000ffff; + return x; +} + +static std::vector +generateDomain(int ambient_voxels, float occupancy, uint32_t seed = 42) +{ + const int target = (int)(occupancy * (float)ambient_voxels); + std::mt19937 rng(seed); + std::uniform_int_distribution dist(0, ambient_voxels - 1); + std::vector voxmap(ambient_voxels, false); + int active = 0; + while (active < target) { + int i = dist(rng); + if (!voxmap[i]) { voxmap[i] = true; ++active; } + } + std::vector coords; + coords.reserve(active); + for (int i = 0; i < ambient_voxels; ++i) { + if (voxmap[i]) { + coords.emplace_back( + (int)coordinate_bitpack( i & 0x49249249), + (int)coordinate_bitpack((i >> 1) & 0x49249249), + (int)coordinate_bitpack((i >> 2) & 0x49249249)); + } + } + return coords; +} + +// ============================================================ +// Neighbor direction utilities (§6a) +// ============================================================ + +/// @brief Return the origin of the neighbor leaf at direction bit d from center. +/// bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1); leaf stride = 8 per axis. +static inline nanovdb::Coord neighborLeafOrigin(const nanovdb::Coord& center, int bit) +{ + const int dx = bit / 9 - 1; + const int dy = (bit / 3) % 3 - 1; + const int dz = bit % 3 - 1; + return center + nanovdb::Coord(dx * 8, dy * 8, dz * 8); +} + +/// @brief Extract the local axis coordinate from a voxelOffset. +/// NanoVDB leaf layout: offset = lx*64 + ly*8 + lz. +/// axis 0 (x): bits [8:6] → shift 6 +/// axis 1 (y): bits [5:3] → shift 3 +/// axis 2 (z): bits [2:0] → shift 0 +static inline int localAxisCoord(uint16_t vo, int axis) +{ + return (vo >> (6 - 3 * axis)) & 7; +} + +// ============================================================ +// computeNeededDirs (§8e) +// ============================================================ + +/// @brief Return a 27-bit probedMask bitmask of neighbor directions accessed by +/// any lane set in laneMask. 
For WENO5 (R=3) only the 6 face-direction bits +/// {4,10,12,14,16,22} can ever be set; the 21 edge/corner bits remain zero. +static uint32_t computeNeededDirs(const uint16_t* voxelOffset, + int batchStart, + uint32_t laneMask) +{ + uint32_t needed = 0; + for (int i = 0; i < SIMDw; i++) { + if (!(laneMask & (1u << i))) continue; + const uint16_t vo = voxelOffset[batchStart + i]; + for (int axis = 0; axis < 3; axis++) { + const int lc = localAxisCoord(vo, axis); + if (lc < R) needed |= (1u << kLoBit[axis]); + if (lc >= 8-R) needed |= (1u << kHiBit[axis]); + } + } + return needed; +} + +// ============================================================ +// Verification +// ============================================================ + +struct VerifyStats { + uint64_t laneChecks = 0; // stencil-point/lane combinations inspected + uint64_t errors = 0; +}; + +// 18 non-center WENO5 stencil offsets {axis, delta}. +// Each point moves strictly along one axis (axis-aligned stencil). +static constexpr int kWeno5Offsets[18][2] = { + {0,-3},{0,-2},{0,-1},{0,+1},{0,+2},{0,+3}, // x-axis + {1,-3},{1,-2},{1,-1},{1,+1},{1,+2},{1,+3}, // y-axis + {2,-3},{2,-2},{2,-1},{2,+1},{2,+2},{2,+3}, // z-axis +}; + +/// @brief For every active lane (set in laneMask), walk the 18 non-center WENO5 +/// stencil offsets. For offsets that cross a leaf boundary, confirm that +/// batchPtrs[axis+1][lane] matches a direct probeLeaf reference. +/// Also confirms that batchPtrs[0][lane] == &firstLeaf[leafIndex[batchStart+lane]]. +static void verifyBatchPtrs( + const LeafT* const (&batchPtrs)[4][SIMDw], + const LeafT* firstLeaf, + const uint32_t* leafIndex, + const uint16_t* voxelOffset, + int batchStart, + uint32_t laneMask, + AccT& refAcc, + VerifyStats& stats) +{ + for (int i = 0; i < SIMDw; i++) { + if (!(laneMask & (1u << i))) continue; + const int p = batchStart + i; + + const LeafT* centerLeaf = &firstLeaf[leafIndex[p]]; + const nanovdb::Coord cOrig = centerLeaf->origin(); + const uint16_t vo = voxelOffset[p]; + + // Center slot must always point to the center leaf. + stats.laneChecks++; + if (batchPtrs[0][i] != centerLeaf) { + ++stats.errors; + if (stats.errors <= 10) + std::cerr << "CENTER MISMATCH lane=" << i << "\n"; + } + + // Walk each stencil offset. + for (const auto& off : kWeno5Offsets) { + const int axis = off[0]; + const int delta = off[1]; + const int lc = localAxisCoord(vo, axis); + + const bool crossesLo = (lc + delta < 0); + const bool crossesHi = (lc + delta >= 8); + if (!crossesLo && !crossesHi) continue; // stays in center leaf + + // Expected: probe the adjacent leaf in the crossing direction. + const int dirBit = crossesLo ? 
kLoBit[axis] : kHiBit[axis]; + const nanovdb::Coord nOrig = neighborLeafOrigin(cOrig, dirBit); + const LeafT* expected = refAcc.probeLeaf(nOrig); + const LeafT* actual = batchPtrs[1 + axis][i]; + + stats.laneChecks++; + if (actual != expected) { + ++stats.errors; + if (stats.errors <= 10) { + std::cerr << "MISMATCH: lane=" << i + << " axis=" << axis << " delta=" << delta + << " lc=" << lc + << " expected=" << static_cast(expected) + << " actual=" << static_cast(actual) << "\n"; + } + } + } + } +} + +// ============================================================ +// Main prototype: Phase 1 (neighbor leaf resolution) + verification +// ============================================================ + +static void runPrototype(const GridT* grid, + const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) +{ + const auto& tree = grid->tree(); + const LeafT* firstLeaf = tree.getFirstNode<0>(); + const uint64_t nVoxels = grid->activeVoxelCount(); + const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); + + const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); + const uint64_t* jumpMap = vbmHandle.hostJumpMap(); + + // One ReadAccessor per thread, reused across all blocks (§8c). + AccT acc = grid->getAccessor(); + + // Block-local scratch (stack-resident, stays in L1 across batches). + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + VerifyStats stats; + uint64_t nStraddles = 0; + + for (uint32_t bID = 0; bID < nBlocks; bID++) { + const uint64_t blockFirstOffset = + vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth; + + // Decode inverse maps. + CPUVBM::decodeInverseMaps( + grid, + firstLeafID[bID], + &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength], + blockFirstOffset, + leafIndex, + voxelOffset); + + // Recompute nLeaves from jumpMap; avoids modifying decodeInverseMaps API + // and keeps CPU/CUDA API symmetric (§9). + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; w++) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]); + + // Block-level neighbor-leaf resolution state (§8d, §8f). + uint32_t currentLeafID = firstLeafID[bID]; + uint32_t probedMask = 0; + const LeafT* ptrs[27] = {}; + nanovdb::Coord centerLeafCoord = firstLeaf[currentLeafID].origin(); + + // Process SIMD batches. + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + + // Build active-lane mask: positions with a valid (non-sentinel) leafIndex. + uint32_t activeMask = 0; + for (int i = 0; i < SIMDw; i++) { + if (leafIndex[batchStart + i] != CPUVBM::UnusedLeafIndex) + activeMask |= (1u << i); + } + if (!activeMask) continue; + + // Track straddle batches for diagnostic output. + for (int i = 0; i < SIMDw; i++) { + if ((activeMask & (1u << i)) && + leafIndex[batchStart + i] != currentLeafID) { + nStraddles++; + break; + } + } + + // Inner loop: consume one center leaf's worth of lanes per iteration. + while (activeMask) { + // Identify lanes belonging to currentLeafID. + uint32_t leafMask = 0; + for (int i = 0; i < SIMDw; i++) { + if ((activeMask & (1u << i)) && + leafIndex[batchStart + i] == currentLeafID) + leafMask |= (1u << i); + } + + if (!leafMask) { + // No lanes for currentLeafID: advance to next leaf. 
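+                    // Safe to advance linearly: leafIndex is non-decreasing across
+                    // the block (§8f invariant), so the lanes we are looking for can
+                    // only belong to a later leaf, never an earlier one.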
+ assert(currentLeafID < firstLeafID[bID] + (uint32_t)nExtraLeaves); + currentLeafID++; + probedMask = 0; + centerLeafCoord = firstLeaf[currentLeafID].origin(); + continue; + } + + // --- Phase 1: probe newly needed neighbor leaves (§8d) --- + const uint32_t neededMask = computeNeededDirs(voxelOffset, batchStart, leafMask); + uint32_t toProbe = neededMask & ~probedMask; + + while (toProbe) { + const int d = __builtin_ctz(toProbe); + ptrs[d] = acc.probeLeaf(neighborLeafOrigin(centerLeafCoord, d)); + probedMask |= (1u << d); + toProbe &= toProbe - 1; + } + + // --- Phase 2: populate per-lane batchPtrs[4][SIMDw] (§6c) --- + // batchPtrs[0][i] = center leaf + // batchPtrs[1][i] = x-axis neighbor (lo, hi, or nullptr) + // batchPtrs[2][i] = y-axis neighbor + // batchPtrs[3][i] = z-axis neighbor + const LeafT* batchPtrs[4][SIMDw] = {}; + for (int i = 0; i < SIMDw; i++) { + if (!(leafMask & (1u << i))) continue; + batchPtrs[0][i] = &firstLeaf[currentLeafID]; + for (int axis = 0; axis < 3; axis++) { + const int lc = localAxisCoord(voxelOffset[batchStart + i], axis); + if (lc < R) + batchPtrs[1 + axis][i] = ptrs[kLoBit[axis]]; + else if (lc >= 8-R) + batchPtrs[1 + axis][i] = ptrs[kHiBit[axis]]; + // else: nullptr (interior lane for this axis) + } + } + + // --- Verification --- + verifyBatchPtrs(batchPtrs, firstLeaf, leafIndex, voxelOffset, + batchStart, leafMask, acc, stats); + + activeMask &= ~leafMask; + } + } + } + + std::cout << "Prototype (Phase 1 verification):\n" + << " blocks = " << nBlocks << "\n" + << " voxels = " << nVoxels << "\n" + << " straddles = " << nStraddles << "\n" + << " laneChecks = " << stats.laneChecks << "\n"; + + if (stats.errors == 0) + std::cout << " PASSED\n"; + else + std::cerr << " FAILED: " << stats.errors << " mismatches\n"; +} + +// ============================================================ +// Entry point +// ============================================================ + +int main(int argc, char** argv) +{ + try { + int ambient_voxels = 1024 * 1024; // smaller default than the CUDA test + float occupancy = 0.5f; + + if (argc > 1) ambient_voxels = std::stoi(argv[1]); + if (argc > 2) occupancy = std::stof(argv[2]); + occupancy = std::max(0.0f, std::min(1.0f, occupancy)); + + std::cout << "ambient_voxels = " << ambient_voxels << "\n" + << "occupancy = " << occupancy << "\n"; + + auto coords = generateDomain(ambient_voxels, occupancy); + std::cout << "Active voxels generated: " << coords.size() << "\n"; + + // Build a float build grid from the coordinates. + nanovdb::tools::build::Grid buildGrid(0.f); + for (const auto& coord : coords) + buildGrid.tree().setValue(coord, 1.f); + + // Convert build::Grid → NanoGrid → NanoGrid. + // Two-step because createNanoGrid accepts NanoGrid + // as its source type (same path as ex_index_grid_cuda). + auto floatHandle = nanovdb::tools::createNanoGrid(buildGrid); + auto indexHandle = nanovdb::tools::createNanoGrid< + nanovdb::NanoGrid, + nanovdb::ValueOnIndex>( + *floatHandle.grid(), + 0u, // channels: no sidecar blind data + false, // includeStats + false); // includeTiles + auto* grid = indexHandle.grid(); + if (!grid) throw std::runtime_error("Failed to create ValueOnIndex grid"); + + const auto& tree = grid->tree(); + std::cout << "Leaves=" << tree.nodeCount(0) + << " Lower=" << tree.nodeCount(1) + << " Upper=" << tree.nodeCount(2) + << " Active=" << grid->activeVoxelCount() << "\n"; + + // Build VBM. 
+ auto vbmHandle = nanovdb::tools::buildVoxelBlockManager(grid); + std::cout << "VBM blocks=" << vbmHandle.blockCount() + << " (BlockWidth=" << BlockWidth << ")\n\n"; + + runPrototype(grid, vbmHandle); + + } catch (const std::exception& e) { + std::cerr << "Exception: " << e.what() << "\n"; + return 1; + } + return 0; +} From f5041df559faf0169245a0f3cbfd1e46facadef4 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 13 Apr 2026 01:38:56 -0500 Subject: [PATCH 14/60] ex_stencil_gather_cpu: gather-site sentinel + AVX2 carry trick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor computeNeededDirs to accept a pre-expanded Simd vector, moving sentinel/masking responsibility to the single gather site where leafMask is known: - kSentinelExpanded = expandVoxelOffset(292) = 0x41044104 (constexpr) - Caller broadcasts sentinel to all lanes, overwrites leafMask lanes with real expandVoxelOffset() values before calling computeNeededDirs - computeNeededDirs is now a pure add+reduce with no masking or cross-check Carry trick (§8e): expandVoxelOffset packs lz/lx/ly into 6 guarded 3-bit groups; a single vpaddd ymm × 2 + vpor + vpand + shuffle-tree detects all six WENO5 directions simultaneously. kExpandCarryK = 0x514530C3. AVX2 codegen confirmed via objdump: - computeNeededDirs: vpbroadcastd + 2×vpaddd ymm + vpor/vpand ymm + vextracti128/vpsrldq shuffle-tree, no branches or calls in hot path - activeMask/leafMask in runPrototype: vpcmpeqd ymm × 4 + vmovmskps ymm × 2 - Sentinel broadcast: 0x41044104 literal → vpbroadcastd → 2×vmovdqa ymm Always-on scalar cross-check at every computeNeededDirs call site. verifyComputeNeededDirsSentinel() tests both the sentinel carry property and the straddle-lane non-pollution scenario before runPrototype(). StencilGather.md §8e and §8f updated to match new API and codegen notes. Phase 1 prototype marked complete in §9. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/examples/CMakeLists.txt | 3 + .../stencil_gather_cpu.cpp | 362 ++++++++++++++++-- .../StencilGather.md | 163 ++++++-- simd_test/Simd.h | 45 +++ 4 files changed, 503 insertions(+), 70 deletions(-) diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt index 33d1cab330..66f9365d78 100644 --- a/nanovdb/nanovdb/examples/CMakeLists.txt +++ b/nanovdb/nanovdb/examples/CMakeLists.txt @@ -126,6 +126,9 @@ endif() nanovdb_example(NAME "ex_stencil_gather_cpu") if(TARGET ex_stencil_gather_cpu) target_compile_options(ex_stencil_gather_cpu PRIVATE -mavx2 -fopenmp-simd) + # simd_test/Simd.h lives three levels above this CMakeLists (at the repo root). + target_include_directories(ex_stencil_gather_cpu PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() if(CUDAToolkit_FOUND) diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index 0a898009b5..88b03a62b1 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -34,6 +34,7 @@ #include #include #include +#include // SimdMask, Simd, any_of, none_of, to_bitmask #include #include @@ -58,6 +59,10 @@ using LeafT = nanovdb::NanoLeaf; using CPUVBM = nanovdb::tools::VoxelBlockManager; using AccT = nanovdb::DefaultReadAccessor; +// Lane-predicate types: SIMDw-wide boolean mask and the uint32_t vector it compares. 
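+// Illustrative use (hypothetical variable names): given a LeafIdxVec holding the
+// batch's leafIndex entries, `idxVec == LeafIdxVec(currentLeafID)` yields the
+// LaneMask of lanes that belong to the current center leaf.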
+using LeafIdxVec = nanovdb::util::Simd<uint32_t, SIMDw>;
+using LaneMask = nanovdb::util::SimdMask<uint32_t, SIMDw>;
+
 // Direction bit encoding shared across all stencil types:
 //   bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1),  dx,dy,dz ∈ {-1,0,+1}
 //
@@ -129,29 +134,302 @@ static inline int localAxisCoord(uint16_t vo, int axis)
 }
 
 // ============================================================
-// computeNeededDirs (§8e)
+// Vectorized computeNeededDirs — shift-OR carry trick (§8e)
+//
+// Determines which of the 6 face-neighbor directions (±x, ±y, ±z) are required
+// by any active lane in a SIMDw-wide batch. For WENO5 (R=3):
+//   plus-c neighbor needed iff any active lane has local coordinate lc ≥ 8−R = 5
+//   minus-c neighbor needed iff any active lane has local coordinate lc ≤ R−1 = 2
+//
+// Algorithm — "expand, add, reduce":
+//   1. expandVoxelOffset(): pack lz, lx, ly (and second copies) into a 32-bit
+//      integer with 3-bit zero guards between groups. Each group occupies its
+//      own 3-bit field; the guards absorb carries so adjacent groups do not bleed.
+//   2. Add kExpandCarryK to all SIMDw lane values simultaneously (one SIMD add).
+//      Groups 1–3 detect plus-directions; groups 4–6 detect minus-directions.
+//   3. Horizontal OR across all lanes: carry bit SET → plus-direction needed.
+//      Horizontal AND across all lanes: carry bit CLEAR → minus-direction needed.
+//      (A minus-direction is needed when at least one lane has NO carry, i.e.,
+//      lc < R; the AND bit is clear iff any lane failed to carry.)
+//
// ============================================================

+/// @brief voxelOffset sentinel for inactive / don't-care SIMD lanes.
+///
+/// Any lane with laneMask[i] = false needs a voxelOffset value that:
+///   • does NOT set carry bits 3, 9, 15 (would wrongly assert plus-directions), AND
+///   • DOES set carry bits 19, 25, 31 (a clear bit would wrongly assert minus-directions).
+///
+/// Local coordinate (4, 4, 4) satisfies both: R ≤ 4 < 8−R for R=3 (strictly interior).
+///   voxelOffset(4,4,4) = 4*64 + 4*8 + 4 = 292
+///   group 1-3: 4 + 3 = 7 → no carry → bits 3, 9, 15 stay clear ✓
+///   group 4-6: 4 + 5 = 9 → carry → bits 19, 25, 31 stay set ✓
+///
+static constexpr uint16_t kInactiveVoxelOffset = (4u << 6) | (4u << 3) | 4u; // = 292
+
+/// @brief Expand a 9-bit voxelOffset into a 32-bit "carry lane" layout.
+///
+/// NanoVDB voxelOffset bit layout: [8:6] = lx, [5:3] = ly, [2:0] = lz. 
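+/// Example: vo = 292 = 0b100'100'100 encodes (lx, ly, lz) = (4, 4, 4); it
+/// expands to 0x41044104, pre-computed below as kSentinelExpanded.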
+/// +/// Target 32-bit layout — 6 groups of 3 bits with zero-guard separators: +/// +/// bits 0– 2 : lz ← group 1 (plus-z carry exits at bit 3) +/// bits 3– 5 : 0 (3-bit guard) +/// bits 6– 8 : lx ← group 2 (plus-x carry exits at bit 9) +/// bits 9–11 : 0 (3-bit guard) +/// bits 12–14 : ly ← group 3 (plus-y carry exits at bit 15) +/// bit 15 : 0 (1-bit guard — sufficient because max carry from a +/// 3-bit field added to a constant < 8 is exactly 1 bit; +/// at bit 15: input=0, addend=0, carry-in∈{0,1} → no further carry) +/// bits 16–18 : lz ← group 4 (minus-z carry exits at bit 19) +/// bits 19–21 : 0 (3-bit guard) +/// bits 22–24 : lx ← group 5 (minus-x carry exits at bit 25) +/// bits 25–27 : 0 (3-bit guard) +/// bits 28–30 : ly ← group 6 (minus-y carry exits at bit 31) +/// bit 31 : 0 (receives minus-y carry; bit 31 is within uint32_t range) +/// +/// Construction — three shift-OR steps, no multiply: +/// +/// Step 1: e |= (e << 9) → 0o xyzxyz (two 9-bit copies stacked, 18 bits) +/// Step 2: e &= 0x71C7 → keep lz@[0:2], lx@[6:8], ly@[12:14]; zero all others. +/// 0x71C7 = 0b 0111 0001 1100 0111 = 0o 070707 +/// Bits set: {0,1,2, 6,7,8, 12,13,14} +/// Step 3: e |= (e << 16) → copy the 15-bit pattern to bits [16:18],[22:24],[28:30] +/// +static inline constexpr uint32_t expandVoxelOffset(uint16_t vo) +{ + uint32_t e = vo; + e |= (e << 9); // step 1: two packed xyz copies at 9-bit stride + e &= 0x71C7u; // step 2: isolate lz@[0:2], lx@[6:8], ly@[12:14] with zero gaps + e |= (e << 16); // step 3: second copy to bits [16:18], [22:24], [28:30] + return e; +} + +/// @brief Combined carry-detection constant (added to expandVoxelOffset results). +/// +/// Groups 1–3 receive +R so a 3-bit field ≥ (8−R) produces a carry (plus-direction test). +/// Groups 4–6 receive +(8−R) so a 3-bit field ≥ R produces a carry (minus-direction test: +/// carry CLEAR ⟺ field < R ⟺ minus-direction needed). +/// +/// group1 group2 group3 group4 group5 group6 +/// K = R | R<<6 | R<<12 | (8-R)<<16 | (8-R)<<22 | (8-R)<<28 +/// = 3 | 192 | 12288 | 327680 | 20971520 | 1342177280 +/// = 1,363,488,963 (0x514530C3) — fits in uint32_t (< 2^32). +/// +/// Carry bits produced by expanded + K: +/// bit 3 set ↔ lz ≥ 8−R → plus-z needed +/// bit 9 set ↔ lx ≥ 8−R → plus-x needed +/// bit 15 set ↔ ly ≥ 8−R → plus-y needed +/// bit 19 clear ↔ lz < R → minus-z needed +/// bit 25 clear ↔ lx < R → minus-x needed +/// bit 31 clear ↔ ly < R → minus-y needed +/// +static constexpr uint32_t kExpandCarryK = + ((uint32_t)R ) | // bits 0– 2: +R → lz plus-z group + ((uint32_t)R << 6) | // bits 6– 8: +R → lx plus-x group + ((uint32_t)R << 12) | // bits 12–14: +R → ly plus-y group + ((uint32_t)(8-R) << 16) | // bits 16–18: +5 → lz minus-z group + ((uint32_t)(8-R) << 22) | // bits 22–24: +5 → lx minus-x group + ((uint32_t)(8-R) << 28); // bits 28–30: +5 → ly minus-y group + +/// @brief Pre-expanded sentinel value for inactive / straddle SIMD lanes. +/// +/// Caller broadcasts this to all lanes before overwriting the leafMask lanes +/// with the real expandVoxelOffset() values. Equivalent to +/// expandVoxelOffset(kInactiveVoxelOffset) +/// which, at compile time, is 0x41044104. +static constexpr uint32_t kSentinelExpanded = expandVoxelOffset(kInactiveVoxelOffset); + +/// @brief Scalar reference implementation (lane-by-lane loop). +/// Kept alongside the SIMD version so debug builds can cross-check. 
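+///        (In this prototype the cross-check is always on: every SIMD
+///        computeNeededDirs call site also evaluates this scalar version and
+///        aborts on any disagreement.)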
+static uint32_t computeNeededDirsScalar(const uint16_t* voxelOffset,
+                                        int batchStart,
+                                        LaneMask laneMask)
 {
     uint32_t needed = 0;
     for (int i = 0; i < SIMDw; i++) {
-        if (!(laneMask & (1u << i))) continue;
+        if (!laneMask[i]) continue;
         const uint16_t vo = voxelOffset[batchStart + i];
         for (int axis = 0; axis < 3; axis++) {
             const int lc = localAxisCoord(vo, axis);
-            if (lc < R) needed |= (1u << kLoBit[axis]);
-            if (lc >= 8-R) needed |= (1u << kHiBit[axis]);
+            if (lc < R)    needed |= (1u << kLoBit[axis]);
+            if (lc >= 8-R) needed |= (1u << kHiBit[axis]);
         }
     }
     return needed;
 }
 
+/// @brief Vectorized computeNeededDirs — shift-OR carry trick.
+///
+/// Returns the 27-bit probedMask subset identifying which of the 6 WENO5
+/// face-neighbor directions are required by any active lane.
+///
+/// For WENO5 (R=3) only the 6 face-direction bits {4,10,12,14,16,22} can
+/// ever be set; the 21 edge/corner bits remain zero.
+///
+/// @param expandedVec SIMDw pre-expanded voxelOffset values (see expandVoxelOffset).
+/// Caller is responsible for:
+///   • Broadcasting kSentinelExpanded to all lanes first.
+///   • Overwriting leafMask lanes with expandVoxelOffset(voxelOffset[...]).
+/// This keeps sentinel / masking logic at the single gather site where leafMask
+/// is known, not buried inside this function.
+///
+/// High-level flow:
+///   1. Single SIMD add of kExpandCarryK (caller already expanded each lane).
+///   2. Horizontal OR of all results → carry SET = plus-direction needed.
+///      Horizontal AND of all results → carry CLEAR = minus-direction needed.
+///   3. Map carry bits to the 27-bit probedMask encoding.
+///
+static uint32_t computeNeededDirs(nanovdb::util::Simd<uint32_t, SIMDw> expandedVec)
+{
+    using VecU32 = nanovdb::util::Simd<uint32_t, SIMDw>;
+
+    // --- Single SIMD add --------------------------------------------------
+    // Inject carry-detection thresholds for all 6 groups simultaneously.
+    // After this add, each lane's result[i] encodes all six direction tests
+    // as carry bits at positions 3, 9, 15 (plus) and 19, 25, 31 (minus).
+    const VecU32 result = expandedVec + VecU32(kExpandCarryK);
+
+    // --- Horizontal reductions --------------------------------------------
+    //
+    // hor_or:  bit k is set iff at least one lane has bit k set in result.
+    //          → Check carry bits 3 (z), 9 (x), 15 (y): SET means plus-direction needed.
+    //
+    // hor_and: bit k is set iff every lane has bit k set in result.
+    //          → Check carry bits 19 (z), 25 (x), 31 (y): CLEAR means minus-direction
+    //            needed (at least one lane did not carry, i.e., its coordinate < R).
+    //
+    uint32_t hor_or = 0u, hor_and = ~0u;
+    for (int i = 0; i < SIMDw; i++) {
+        hor_or  |= result[i];
+        hor_and &= result[i];
+    }
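+    // (Worked example, derived from the constants above: a lane with vo = 294
+    //  (lz=6) yields result = 0x924B71C9 — carry at bit 3 → plus-z; a lane with
+    //  vo = 290 (lz=2) yields 0x924771C5 — bit 19 clear → minus-z. hor_or picks
+    //  up the former, hor_and the latter.)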
+    // --- Map carry bits → probedMask direction bits -----------------------
+    //
+    // Plus carries (bits 3, 9, 15) set      → kHiBit (hi-side neighbor needed).
+    // Minus carries (bits 19, 25, 31) clear → kLoBit (lo-side neighbor needed).
+    //
+    //   carry bit | axis | condition  | probedMask bit
+    //   ----------+------+------------+---------------
+    //      3      |  z   | lz ≥ 8−R   | kHiBit[2] = 14
+    //      9      |  x   | lx ≥ 8−R   | kHiBit[0] = 22
+    //     15      |  y   | ly ≥ 8−R   | kHiBit[1] = 16
+    //     19 clr  |  z   | lz < R     | kLoBit[2] = 12
+    //     25 clr  |  x   | lx < R     | kLoBit[0] = 4
+    //     31 clr  |  y   | ly < R     | kLoBit[1] = 10
+    //
+    uint32_t needed = 0;
+    if (  hor_or  & (1u <<  3))  needed |= (1u << kHiBit[2]); // plus-z
+    if (  hor_or  & (1u <<  9))  needed |= (1u << kHiBit[0]); // plus-x
+    if (  hor_or  & (1u << 15))  needed |= (1u << kHiBit[1]); // plus-y
+    if (!(hor_and & (1u << 19))) needed |= (1u << kLoBit[2]); // minus-z
+    if (!(hor_and & (1u << 25))) needed |= (1u << kLoBit[0]); // minus-x
+    if (!(hor_and & (1u << 31))) needed |= (1u << kLoBit[1]); // minus-y
+
+    return needed;
+}
+
+// ============================================================
+// Targeted sentinel correctness test (§8e supplement)
+//
+// Verifies that inactive lanes — including straddle lanes that ARE active
+// voxels but belong to a different leaf — do not inject spurious direction
+// bits into the SIMD result.
+//
+// The test is designed so that a broken sentinel (i.e., using the straddle
+// lane's real voxelOffset instead of kInactiveVoxelOffset) would produce a
+// DIFFERENT result from the scalar reference in BOTH the plus and minus
+// directions, making the bug impossible to miss.
+//
+// Layout (SIMDw = 16 lanes):
+//   leafMask lanes (even: 0,2,4,...,14):
+//     lx=4 (neutral for x), ly=4 (neutral for y), lz=6 (→ plus-z needed)
+//     voxelOffset = 4*64 + 4*8 + 6 = 294
+//
+//   straddle lanes (odd: 1,3,5,...,15) — active voxels, wrong leaf:
+//     lx=0 (→ minus-x if used), ly=7 (→ plus-y if used), lz=1 (→ minus-z if used)
+//     voxelOffset = 0*64 + 7*8 + 1 = 57
+//
+// Expected result (scalar — straddle lanes ignored):
+//   plus-z needed (bit kHiBit[2]=14): lz=6 ≥ 5 in leafMask lanes ✓
+//   minus-x NOT needed: lx=4 ≥ R=3   for all leafMask lanes ✓
+//   plus-y  NOT needed: ly=4 < 8-R=5 for all leafMask lanes ✓
+//   minus-z NOT needed: lz=6 ≥ R=3   for all leafMask lanes ✓
+//   plus-x  NOT needed: lx=4 < 8-R=5 for all leafMask lanes ✓
+//   minus-y NOT needed: ly=4 ≥ R=3   for all leafMask lanes ✓
+//
+// If sentinel fails: straddle lx=0 → minus-x spuriously added;
+//                    straddle ly=7 → plus-y  spuriously added;
+//                    straddle lz=1 → minus-z spuriously added.
+// Those discrepancies are caught by the explicit scalar cross-check in the
+// test body below, which will abort immediately.
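+//
+// Worked carry arithmetic for the straddle value (voxelOffset = 57, i.e.
+// lx=0, ly=7, lz=1), had it leaked into the SIMD input instead of the sentinel:
+//   expandVoxelOffset(57) = 0x70017001  (lz=1 @ bits 0–2, lx=0 @ 6–8, ly=7 @ 12–14)
+//   + kExpandCarryK:  lz: 1+3=4 (no carry)   lx: 0+3=3 (no carry)
+//                     ly: 7+3=10 → carry at bit 15  (spurious plus-y)
+//                     lz: 1+5=6  → bit 19 clear     (spurious minus-z)
+//                     lx: 0+5=5  → bit 25 clear     (spurious minus-x)
+//                     ly: 7+5=12 → bit 31 set       (ok)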
+// ============================================================
+
+static void verifyComputeNeededDirsSentinel()
+{
+    // --- Sentinel property: expandVoxelOffset(292) + K must have ---
+    // --- plus-carry bits {3,9,15} clear and minus-carry bits {19,25,31} set ---
+    {
+        const uint32_t expanded = expandVoxelOffset(kInactiveVoxelOffset);
+        const uint32_t result   = expanded + kExpandCarryK;
+        const bool plus_ok  = !(result & ((1u<<3)|(1u<<9)|(1u<<15)));
+        const bool minus_ok = (result & ((1u<<19)|(1u<<25)|(1u<<31))) ==
+                              ((1u<<19)|(1u<<25)|(1u<<31));
+        if (!plus_ok || !minus_ok) {
+            std::cerr << "verifyComputeNeededDirsSentinel: sentinel carry property violated"
+                      << " expanded=0x" << std::hex << expanded
+                      << " result=0x" << result << std::dec << "\n";
+            std::abort();
+        }
+    }
+
+    // --- Straddle scenario: straddle lanes must not pollute the result ---
+    alignas(64) uint16_t voxelOffset[BlockWidth] = {};
+
+    // leafMask lanes (even): lx=4, ly=4, lz=6 → voxelOffset = 4*64+4*8+6 = 294
+    // straddle lanes (odd):  lx=0, ly=7, lz=1 → voxelOffset = 0*64+7*8+1 = 57
+    LaneMask laneMask;
+    for (int i = 0; i < SIMDw; i++) {
+        const bool active = (i % 2 == 0);
+        laneMask[i] = active;
+        voxelOffset[i] = active ? uint16_t(294) : uint16_t(57);
+    }
+
+    // Expected: only plus-z (kHiBit[2] = 14) should be set.
+    //
+    // Build the pre-expanded vector exactly as the gather site would.
+    using VecU32 = nanovdb::util::Simd<uint32_t, SIMDw>;
+    VecU32 expandedVec(kSentinelExpanded);
+    for (int i = 0; i < SIMDw; i++) {
+        if (laneMask[i]) expandedVec[i] = expandVoxelOffset(voxelOffset[i]);
+    }
+    const uint32_t result = computeNeededDirs(expandedVec);
+
+    // Explicit cross-check: scalar reference (SIMD cross-check no longer lives inside
+    // computeNeededDirs — it is the caller's responsibility at each gather site).
+    {
+        const uint32_t ref = computeNeededDirsScalar(voxelOffset, 0, laneMask);
+        if (result != ref) {
+            std::cerr << "verifyComputeNeededDirsSentinel: SIMD/scalar mismatch"
+                      << " simd=0x" << std::hex << result
+                      << " ref=0x" << ref << std::dec << "\n";
+            std::abort();
+        }
+    }
+
+    const uint32_t expected = (1u << kHiBit[2]); // plus-z only
+
+    if (result != expected) {
+        std::cerr << "verifyComputeNeededDirsSentinel: wrong direction mask"
+                  << " got=0x" << std::hex << result
+                  << " expected=0x" << expected << std::dec << "\n";
+        std::abort();
+    }
+
+    std::cout << "verifyComputeNeededDirsSentinel: PASSED\n";
+}
+
 // ============================================================
 // Verification
 // ============================================================
@@ -179,12 +457,12 @@ static void verifyBatchPtrs(
     const uint32_t* leafIndex,
     const uint16_t* voxelOffset,
     int batchStart,
-    uint32_t laneMask,
+    LaneMask laneMask,
     AccT& refAcc,
     VerifyStats& stats)
 {
     for (int i = 0; i < SIMDw; i++) {
-        if (!(laneMask & (1u << i))) continue;
+        if (!laneMask[i]) continue;
 
         const int p = batchStart + i;
         const LeafT* centerLeaf = &firstLeaf[leafIndex[p]];
@@ -284,34 +562,27 @@ static void runPrototype(const GridT*
 
         // Process SIMD batches.
         for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
 
-            // Build active-lane mask: positions with a valid (non-sentinel) leafIndex.
-            uint32_t activeMask = 0;
-            for (int i = 0; i < SIMDw; i++) {
-                if (leafIndex[batchStart + i] != CPUVBM::UnusedLeafIndex)
-                    activeMask |= (1u << i);
-            }
-            if (!activeMask) continue;
+            // Load the SIMDw leafIndex values for this batch once; reused below.
+            const LeafIdxVec leafSlice(&leafIndex[batchStart], nanovdb::util::element_aligned);
+
+            // Active-lane mask: lanes with a valid (non-sentinel) leafIndex.
+            LaneMask activeMask = (leafSlice != LeafIdxVec(CPUVBM::UnusedLeafIndex));
+            if (nanovdb::util::none_of(activeMask)) continue;
 
             // Track straddle batches for diagnostic output.
             for (int i = 0; i < SIMDw; i++) {
-                if ((activeMask & (1u << i)) &&
-                    leafIndex[batchStart + i] != currentLeafID) {
+                if (activeMask[i] && leafIndex[batchStart + i] != currentLeafID) {
                     nStraddles++;
                     break;
                 }
             }
 
             // Inner loop: consume one center leaf's worth of lanes per iteration.
-            while (activeMask) {
+            while (nanovdb::util::any_of(activeMask)) {
 
                 // Identify lanes belonging to currentLeafID.
-                uint32_t leafMask = 0;
-                for (int i = 0; i < SIMDw; i++) {
-                    if ((activeMask & (1u << i)) &&
-                        leafIndex[batchStart + i] == currentLeafID)
-                        leafMask |= (1u << i);
-                }
+                LaneMask leafMask = activeMask & (leafSlice == LeafIdxVec(currentLeafID));
 
-                if (!leafMask) {
+                if (nanovdb::util::none_of(leafMask)) {
                     // No lanes for currentLeafID: advance to next leaf.
                     assert(currentLeafID < firstLeafID[bID] + (uint32_t)nExtraLeaves);
                     currentLeafID++;
@@ -321,7 +592,31 @@ static void runPrototype(const GridT*
                 }
 
                 // --- Phase 1: probe newly needed neighbor leaves (§8d) ---
-                const uint32_t neededMask = computeNeededDirs(voxelOffset, batchStart, leafMask);
+                //
+                // Build the pre-expanded vector at the gather site — the only
+                // place where leafMask is known. Broadcast the sentinel first
+                // (straddle / inactive lanes stay neutral), then overwrite the
+                // leafMask lanes with their actual expandVoxelOffset values.
+                using VecU32 = nanovdb::util::Simd<uint32_t, SIMDw>;
+                VecU32 expandedVec(kSentinelExpanded);
+                for (int i = 0; i < SIMDw; i++) {
+                    if (leafMask[i])
+                        expandedVec[i] = expandVoxelOffset(voxelOffset[batchStart + i]);
+                }
+                const uint32_t neededMask = computeNeededDirs(expandedVec);
+
+                // Cross-check against scalar reference (always-on; overhead is
+                // ~18 scalar ops per batch, negligible vs. the probeLeaf calls).
+                {
+                    const uint32_t ref = computeNeededDirsScalar(voxelOffset, batchStart, leafMask);
+                    if (neededMask != ref) {
+                        std::cerr << "computeNeededDirs: SIMD/scalar mismatch"
+                                  << " simd=0x" << std::hex << neededMask
+                                  << " ref=0x" << ref << std::dec << "\n";
+                        std::abort();
+                    }
+                }
+
                 uint32_t toProbe = neededMask & ~probedMask;
                 while (toProbe) {
@@ -338,7 +633,7 @@ static void runPrototype(const GridT*
                 //   batchPtrs[3][i] = z-axis neighbor
                 const LeafT* batchPtrs[4][SIMDw] = {};
                 for (int i = 0; i < SIMDw; i++) {
-                    if (!(leafMask & (1u << i))) continue;
+                    if (!leafMask[i]) continue;
                     batchPtrs[0][i] = &firstLeaf[currentLeafID];
                     for (int axis = 0; axis < 3; axis++) {
                         const int lc = localAxisCoord(voxelOffset[batchStart + i], axis);
@@ -354,7 +649,7 @@ static void runPrototype(const GridT*
                 verifyBatchPtrs(batchPtrs, firstLeaf, leafIndex, voxelOffset,
                                 batchStart, leafMask, acc, stats);
 
-                activeMask &= ~leafMask;
+                activeMask = activeMask & !leafMask;
             }
         }
     }
@@ -378,6 +673,9 @@ static void runPrototype(const GridT*
 int main(int argc, char** argv)
 {
     try {
+        // Targeted sentinel test runs unconditionally before any VBM data is needed.
+        verifyComputeNeededDirsSentinel();
+
         int ambient_voxels = 1024 * 1024; // smaller default than the CUDA test
         float occupancy = 0.5f;
 
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
index 9ffca20e10..f8795353d4 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
@@ -471,43 +471,124 @@ batches in that leaf, always ≤ 26 (≤ 6 for WENO5).
 same leaf repeatedly. One accessor per thread, reused across the entire block
 loop, accumulates leaf-cache hits throughout the computation.
 
-### 8e. `computeNeededDirs` — Stencil-Specific Batch Probe Mask
+### 8e. `computeNeededDirs` — Shift-OR Carry Trick
 
 ```cpp
-uint32_t computeNeededDirs(const uint16_t* voxelOffset, uint32_t laneMask);
+// Caller builds the pre-expanded vector at the gather site:
+//   1. broadcast kSentinelExpanded to all SIMDw lanes (inactive/straddle lanes stay neutral)
+//   2. overwrite leafMask lanes with expandVoxelOffset(voxelOffset[b+i])
+// Then call:
+uint32_t computeNeededDirs(Simd<uint32_t, SIMDw> expandedVec);
 ```
 
-Inspects the `voxelOffset` values of active lanes and returns a bitmask of directions
-whose neighbor leaf is accessed by at least one lane. This is purely arithmetic on
-voxelOffsets — no tree access, no probeLeaf.
-
-Direction bits use the shared 3×3×3 encoding from §6a:
-`bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)`.
+Returns a bitmask of directions whose neighbor leaf is required by at least one active
+lane. Purely arithmetic — no tree access, no probeLeaf. Direction bits use the §6a
+encoding: `bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)`.
 
 **Direction encoding (WENO5, 6 active bits out of 27):**
 
 | Bit | Direction | (dx,dy,dz) | Condition (per lane)              |
 |-----|-----------|-----------|-----------------------------------|
-| 4   | x-lo      | (-1,0,0)  | `(vo >> 6) < R`                   |
-| 10  | y-lo      | (0,-1,0)  | `((vo >> 3) & 0x7) < R`           |
-| 12  | z-lo      | (0,0,-1)  | `(vo & 0x7) < R`                  |
-| 14  | z-hi      | (0,0,+1)  | `(vo & 0x7) >= (8 - R)`           |
-| 16  | y-hi      | (0,+1,0)  | `((vo >> 3) & 0x7) >= (8 - R)`    |
-| 22  | x-hi      | (+1,0,0)  | `(vo >> 6) >= (8 - R)`            |
-
-For WENO5 (R=3): z-lo when lz ∈ {0,1,2}, z-hi when lz ∈ {5,6,7}. Each condition
-is a threshold comparison across SIMDw lanes, folded with an `any()` reduction.
-The remaining 21 bits of `neededMask` are always zero for WENO5.
-
-**Box stencil (R=1, up to 26 active bits):** face directions use the same six bit
-positions with thresholds 0 and 7. Edge directions (e.g. (-1,-1,0) → bit 1)
-require a pairwise AND: `any(lx == 0 && ly == 0)`. Corner directions require all
-three simultaneously. Same mechanism throughout — all pure arithmetic on
-voxelOffsets, same `uint32_t` mask type.
+| 4   | x-lo      | (-1,0,0)  | `lx < R`                          |
+| 10  | y-lo      | (0,-1,0)  | `ly < R`                          |
+| 12  | z-lo      | (0,0,-1)  | `lz < R`                          |
+| 14  | z-hi      | (0,0,+1)  | `lz >= (8 - R)`                   |
+| 16  | y-hi      | (0,+1,0)  | `ly >= (8 - R)`                   |
+| 22  | x-hi      | (+1,0,0)  | `lx >= (8 - R)`                   |
+
+where `lx = vo >> 6`, `ly = (vo >> 3) & 7`, `lz = vo & 7`.
+
+**Algorithm — "expand, add, reduce" (single SIMD add for all 6 directions):**
+
+`expandVoxelOffset(vo)` packs lz, lx, ly into a 32-bit integer with 3-bit zero-guard
+separators so that one carry-bit addition simultaneously tests all six directions.
+Three shift-OR steps; no multiply:
+
+```
+e  = vo
+e |= (e << 9)    // two packed xyz copies at 9-bit stride
+e &= 0x71C7      // 0b0111_0001_1100_0111 — isolate lz@[0:2], lx@[6:8], ly@[12:14]
+e |= (e << 16)   // duplicate lower 15 bits to [16:30]
+```
+
+Target layout after expansion:
+
+```
+bits  0– 2 : lz   (group 1 — plus-z carry exits at bit 3)
+bits  3– 5 : 0    (3-bit guard)
+bits  6– 8 : lx   (group 2 — plus-x carry exits at bit 9)
+bits  9–11 : 0
+bits 12–14 : ly   (group 3 — plus-y carry exits at bit 15)
+bit     15 : 0    (1-bit guard — sufficient: max carry from 3-bit + constant < 8 is 1 bit)
+bits 16–18 : lz   (group 4 — minus-z carry exits at bit 19)
+bits 19–21 : 0
+bits 22–24 : lx   (group 5 — minus-x carry exits at bit 25)
+bits 25–27 : 0
+bits 28–30 : ly   (group 6 — minus-y carry exits at bit 31)
+```
+
+`kExpandCarryK` encodes the detection threshold for all six groups in one `uint32_t`
+(= 0x514530C3):
+
+```
+K = R | R<<6 | R<<12 | (8-R)<<16 | (8-R)<<22 | (8-R)<<28
+  = 0x514530C3   (for R = 3)
+```
+
+Groups 1–3 receive `+R`: a field ≥ (8−R) carries (plus-direction needed).
+Groups 4–6 receive `+(8−R)`: a field ≥ R carries (a CLEAR carry means minus-direction
+needed — at least one lane had lc < R).
+
+After `result = expandedVec + kExpandCarryK` (one `vpaddd ymm` × 2):
+
+```
+hor_or  = OR of all lanes  → bit k SET   ↔ plus-direction k needed (any lane)
+hor_and = AND of all lanes → bit k CLEAR ↔ minus-direction k needed (any lane)
+
+if (  hor_or  & (1 <<  3)) neededMask |= (1 << kHiBit[z]);  // bit 14
+if (  hor_or  & (1 <<  9)) neededMask |= (1 << kHiBit[x]);  // bit 22
+if (  hor_or  & (1 << 15)) neededMask |= (1 << kHiBit[y]);  // bit 16
+if (!(hor_and & (1 << 19))) neededMask |= (1 << kLoBit[z]); // bit 12
+if (!(hor_and & (1 << 25))) neededMask |= (1 << kLoBit[x]); // bit 4
+if (!(hor_and & (1 << 31))) neededMask |= (1 << kLoBit[y]); // bit 10
+```
+
+**Sentinel for inactive/straddle lanes:** Local coordinate (4,4,4) maps to
+`kInactiveVoxelOffset = 292`. Its pre-expanded form `kSentinelExpanded = 0x41044104`
+satisfies: groups 1–3 sum to 4+3=7 (no carry → plus bits stay clear), groups 4–6 sum
+to 4+5=9 (carry → minus bits stay set). The sentinel is broadcast at the **gather
+site** — the only place where `leafMask` is known — before overwriting the active
+lanes. This keeps sentinel responsibility out of `computeNeededDirs` itself.
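+
+As a concrete end-to-end check (using the value from the sentinel unit test): a lane
+with `vo = 294` (lx=4, ly=4, lz=6) expands to `0x41064106`; adding `kExpandCarryK`
+gives `0x924B71C9`. Of the six carry positions, only bit 3 is set (lz: 6+3=9) while
+bits 19, 25, 31 are all set (6+5, 4+5, 4+5 ≥ 8) — so the reductions report exactly
+one needed direction, plus-z (probedMask bit 14).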
+
+**Codegen (AVX2, `ex_stencil_gather_cpu`, -O3 -mavx2):**
+
+`computeNeededDirs` compiles to a non-inlined function of ~80 bytes with no branches
+or function calls in the carry path:
+
+```asm
+vpbroadcastd ymm1, xmm0           ; broadcast kExpandCarryK (0x514530c3)
+vpaddd       ymm0, ymm1, [rdi]    ; add to lanes 0–7
+vpaddd       ymm1, ymm1, [rdi+32] ; add to lanes 8–15
+vpor         ymm2, ymm0, ymm1    ; hor_or intermediate (8 lanes)
+vpand        ymm0, ymm0, ymm1    ; hor_and intermediate (8 lanes)
+; shuffle-tree 8→4→2→1 via vextracti128 / vpand / vpor / vpsrldq (×2)
+; scalar carry-bit → neededMask decode via shl/and/test/cmov (branchless)
+vzeroupper; ret
+```
+
+**Codegen for the gather-site loop (within `runPrototype`/`main`):**
+
+- `activeMask = (leafSlice != UnusedLeafIndex)`: `vpcmpeqd ymm × 4` + `vmovmskps ymm × 2` — fully vectorized.
+- `leafMask = activeMask & (leafSlice == currentLeafID)`: `vpbroadcastd` + `vpcmpeqd ymm × 2` + `vmovmskps ymm × 2` + scalar AND — fully vectorized.
+- Sentinel broadcast: `0x41044104` literal → `vpbroadcastd ymm` × 2 stores filling all 64 bytes.
+- `expandVoxelOffset` scatter (per leafMask lane): scalar — 5 ops inlined per lane, gated by
+  bit tests on the 16-bit bitmask. Not vectorizable due to `if (leafMask[i])` branch;
+  dominated by downstream `probeLeaf` calls anyway.
+
+**Box stencil (R=1, up to 26 active bits):** face directions use the same algorithm
+with different thresholds; edge and corner directions require AND of pairwise/triple
+conditions and are left for future work.
 
 `computeNeededDirs` is the only function that encodes knowledge of the stencil's
-reach R and how offsets map to neighbor leaves. It is written once per stencil
-shape and is small (≤ 20 SIMD instructions for WENO5).
+reach R and direction-to-offset mapping. Written once per stencil shape.
 
 ### 8f. CPU Block-Level Loop Structure
 
@@ -554,8 +635,12 @@ for (int b = 0; b < BlockWidth; b += SIMDw) {
         continue;
     }
 
-    // Probe any newly needed neighbors
-    uint32_t neededMask = computeNeededDirs(voxelOffset + b, leafMask);
+    // Build pre-expanded vector at the gather site (only place leafMask is known).
+    // Broadcast sentinel; overwrite active lanes with real expandVoxelOffset().
+    VecU32 expandedVec(kSentinelExpanded);
+    for (int i = 0; i < SIMDw; i++)
+        if (leafMask & (1 << i)) expandedVec[i] = expandVoxelOffset(voxelOffset[b+i]);
+    uint32_t neededMask = computeNeededDirs(expandedVec);
     uint32_t toProbe = neededMask & ~probedMask;
     while (toProbe) {
         int d = __builtin_ctz(toProbe);
@@ -619,16 +704,18 @@ with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strate
   `decodeInverseMaps` returns (popcount loop, same as the internal loop bound).
   `decodeInverseMaps` API is not modified — avoids CPU/GPU asymmetry.
 
-- **Prototype — immediate next step**: `stencil_gather_cpu.cpp` in
-  `ex_voxelBlockManager_host_cuda/`. Scope:
-  - Generate domain with `generateDomain` (reuse from `vbm_host_cuda.cpp`).
-  - Build VBM. Iterate over blocks; call `decodeInverseMaps` per block.
-  - For each batch: run the full §8d probeLeaf + `batchPtrs[4][SIMDw]` population.
-  - Verification only (no `computeStencil`): for each active lane, walk all 19
-    WENO5 offsets, check that `batchPtrs[axis][i]` matches a direct `probeLeaf`
-    reference for every neighbor that crosses a leaf boundary.
-  - Use WENO5 stencil directly (not the simpler 7-pt Laplacian — WENO5 exercises
-    R=3 boundary conditions and all six face directions).
+- **Prototype — DONE** (`ex_stencil_gather_cpu/stencil_gather_cpu.cpp`):
+  Phase 1 (neighbor leaf resolution) fully implemented and verified:
+  - `generateDomain` + VBM build + `decodeInverseMaps` per block.
+  - Full §8d probeLeaf + `batchPtrs[4][SIMDw]` population with lazy `probedMask`.
+  - `computeNeededDirs` with shift-OR carry trick (§8e) and gather-site sentinel.
+  - Always-on scalar cross-check at every `computeNeededDirs` call site.
+  - `verifyComputeNeededDirsSentinel()`: dedicated straddle-lane sentinel unit test.
+  - `verifyBatchPtrs()`: end-to-end per-lane batchPtrs check against direct `probeLeaf`.
+  - AVX2 codegen confirmed via `objdump` for `computeNeededDirs` and all mask
+    operations in the outer/inner loops (see §8e Codegen notes).
+  - Next: implement `computeStencil` (Phase 2 — index gather) and the scalar
+    cross-check launcher.
 
 - **Generalizing beyond R ≤ 3**: the single-neighbor-per-axis assumption is baked
   into the current design. Any stencil with R > 4 would require revisiting §5a
   and §6 (see the example below).
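+
+A minimal illustration of where the single-neighbor-per-axis assumption fails
+(leaf width 8, local coordinate lc along one axis):
+
+```
+R = 3:  lc = 2 → taps reach [-1 .. 5]  → lo neighbor only
+R = 4:  lc = 4 → taps reach [ 0 .. 8]  → hi neighbor only (still one per axis)
+R = 5:  lc = 4 → taps reach [-1 .. 9]  → BOTH lo and hi neighbors on the same axis
+```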
diff --git a/simd_test/Simd.h b/simd_test/Simd.h
index 9470ab9b7c..407ce2a562 100644
--- a/simd_test/Simd.h
+++ b/simd_test/Simd.h
@@ -82,6 +82,13 @@ inline Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
     return result;
 }
 
+template <typename T, int W>
+inline bool any_of(SimdMask<T, W> m) { return stdx::any_of(m); }
+template <typename T, int W>
+inline bool none_of(SimdMask<T, W> m) { return stdx::none_of(m); }
+template <typename T, int W>
+inline bool all_of(SimdMask<T, W> m) { return stdx::all_of(m); }
+
 // ===========================================================================
 // Implementation B: std::array backend (default)
 // ===========================================================================
@@ -92,6 +99,15 @@ struct SimdMask {
     std::array<bool, W> data{};
     NANOVDB_SIMD_HOSTDEV bool operator[](int i) const { return data[i]; }
     NANOVDB_SIMD_HOSTDEV bool& operator[](int i) { return data[i]; }
+    NANOVDB_SIMD_HOSTDEV SimdMask operator!() const {
+        SimdMask r; for (int i = 0; i < W; i++) r.data[i] = !data[i]; return r;
+    }
+    NANOVDB_SIMD_HOSTDEV SimdMask operator&(SimdMask o) const {
+        SimdMask r; for (int i = 0; i < W; i++) r.data[i] = data[i] && o.data[i]; return r;
+    }
+    NANOVDB_SIMD_HOSTDEV SimdMask operator|(SimdMask o) const {
+        SimdMask r; for (int i = 0; i < W; i++) r.data[i] = data[i] || o.data[i]; return r;
+    }
 };
 
 template <typename T, int W>
@@ -128,6 +144,12 @@ struct Simd {
         for (int i = 0; i < W; i++) m.data[i] = data[i] > o.data[i];
         return m;
     }
+    NANOVDB_SIMD_HOSTDEV SimdMask<T, W> operator==(Simd o) const {
+        SimdMask<T, W> m; for (int i = 0; i < W; i++) m.data[i] = data[i] == o.data[i]; return m;
+    }
+    NANOVDB_SIMD_HOSTDEV SimdMask<T, W> operator!=(Simd o) const {
+        SimdMask<T, W> m; for (int i = 0; i < W; i++) m.data[i] = data[i] != o.data[i]; return m;
+    }
 };
 
 template <typename T, int W> NANOVDB_SIMD_HOSTDEV
@@ -160,8 +182,31 @@ NANOVDB_SIMD_HOSTDEV Simd<T, W> where(SimdMask<T, W> mask, Simd<T, W> a, Simd<T, W> b) {
     Simd<T, W> r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r;
 }
 
+template <typename T, int W>
+NANOVDB_SIMD_HOSTDEV bool any_of(SimdMask<T, W> m) {
+    bool r = false; for (int i = 0; i < W; i++) r |= m[i]; return r;
+}
+template <typename T, int W>
+NANOVDB_SIMD_HOSTDEV bool none_of(SimdMask<T, W> m) { return !any_of(m); }
+template <typename T, int W>
+NANOVDB_SIMD_HOSTDEV bool all_of(SimdMask<T, W> m) {
+    bool r = true; for (int i = 0; i < W; i++) r &= m[i]; return r;
+}
+
 #endif // NANOVDB_USE_STD_SIMD
 
+// ---------------------------------------------------------------------------
+// to_bitmask — fold SimdMask into a uint32_t (one bit per lane).
+// T is the associated element type; only W matters. Requires W <= 32.
+// ---------------------------------------------------------------------------
+template <typename T, int W>
+NANOVDB_SIMD_HOSTDEV uint32_t to_bitmask(SimdMask<T, W> m) {
+    static_assert(W <= 32, "to_bitmask: W must be <= 32");
+    uint32_t r = 0;
+    for (int i = 0; i < W; i++) if (m[i]) r |= (1u << i);
+    return r;
+}
+
 // ---------------------------------------------------------------------------
 // Scalar overloads — always present, for T=float (GPU / scalar path)
 // ---------------------------------------------------------------------------

From bf115e22ad12911796ec5e7a2c24577c4b32dd8d Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Mon, 13 Apr 2026 02:20:31 -0500
Subject: [PATCH 15/60] BatchAccessor.md: design for SIMD batch
 leaf-neighborhood cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Documents the BatchAccessor — the SIMD-batch analog of ValueAccessor —
developed from the ex_stencil_gather_cpu Phase 1 prototype discussion.
Core concept: instead of caching the path to one leaf, cache the full 3×3×3
neighborhood of 27 leaf pointers around the current center leaf, serving
SIMDw voxels per call.

Key design elements documented:

Eviction policy: fires on none_of(leafMask) only — straddle lanes do not
evict. leafMask is the "partial-hit" signal with no scalar-accessor analog.

Prefetch coverage argument:
- WENO5 (R=3): 6 extremal taps (±R,0,0),(0,±R,0),(0,0,±R) are necessary and
  sufficient — equivalent to the computeNeededDirs carry trick
- Box stencil (R=1): 8 corner taps (±1,±1,±1) collectively cover all 26
  non-center directions for any voxel position in the batch

Three-tier API:
- prefetch(vo, leafMask, treeAcc)
- cachedGetValue(vo, leafMask) — no treeAcc, cache assumed warm
- getValue(vo, leafMask, treeAcc) — lazy combined (vanilla style)

Template rationale vs runtime Coord: compile-time direction bit, dead-axis
elimination, VDB convention alignment; runtime Coord overload provided for
generic stencil adapters.

AVX2 profile: offset arithmetic (vpaddd ymm), lane split (vpcmpgtd ymm),
gather from ≤2 leaf pointers (vgatherdps×2 + vpblendvb) — both scalar
bottlenecks from Phase 1 prototype are eliminated.

StencilGather.md: add cross-reference to BatchAccessor.md.

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 .../BatchAccessor.md                          | 432 ++++++++++++++++++
 .../StencilGather.md                          |   6 +
 2 files changed, 438 insertions(+)
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md

diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
new file mode 100644
index 0000000000..5331fbfe48
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
@@ -0,0 +1,432 @@
+# BatchAccessor — SIMD Batch Leaf-Neighborhood Cache
+
+This document is the design reference for `BatchAccessor`, the SIMD-batch analog
+of NanoVDB's `ValueAccessor`. It captures the full design rationale developed
+alongside the `ex_stencil_gather_cpu` Phase 1 prototype.
+
+---
+
+## 1. Motivation and Core Analogy
+
+NanoVDB's `DefaultReadAccessor` amortizes the cost of root-to-leaf tree traversal
+by caching the path for a single voxel. When successive scalar `getValue(ijk)` calls
+land in the same leaf, only the first call pays the full traversal; subsequent calls
+hit the cached leaf pointer in ~6 integer instructions.
+
+`BatchAccessor` lifts this idea one level: instead of caching the path to one leaf,
+it caches the **3×3×3 neighborhood of leaf pointers** surrounding the current center
+leaf. Instead of serving one voxel per call, it serves a **SIMD batch of SIMDw
+voxels** simultaneously.
+
+| Property | Scalar `ValueAccessor` | `BatchAccessor` |
+|----------|------------------------|-----------------|
+| Cache unit | Path root→leaf (3 node ptrs) | 27 neighbor leaf ptrs |
+| Granularity | 1 voxel per call | SIMDw voxels per call |
+| Cache key | Voxel coordinate in cached leaf's bbox | `currentLeafID` (VBM ordering) |
+| "Hit" condition | Next voxel in same leaf | `mProbedMask` covers needed direction |
+| Eviction trigger | Implicit on any miss | Explicit: `none_of(leafMask)` |
+| Guarantee of hit rate | Access-pattern dependent | Structural (VBM Morton ordering) |
+
+The hit rate of the scalar accessor depends on the access pattern. `BatchAccessor`'s
+amortization is **structural**: the VBM groups voxels by leaf, so within any batch,
+the center leaf is known in advance, and directions probed for batch k remain valid
+for all subsequent batches in the same center leaf.
+
+---
+
+## 2. Cache State
+
+Four pieces of state persist across batches within one center leaf:
+
+```cpp
+template <typename BuildT>
+class BatchAccessor {
+    uint32_t       mProbedMask = 0;   // bit d set ↔ direction d has been probed
+    const LeafT*   mPtrs[27]   = {};  // canonical neighbor table; mPtrs[13] = center
+    uint32_t       mCurrentLeafID;    // index of current center leaf
+    nanovdb::Coord mCenterLeafCoord;  // origin of current center leaf
+    // (plus a reference to the underlying grid for probeLeaf calls)
+};
+```
+
+`mPtrs[27]` uses the shared 3×3×3 direction encoding from `StencilGather.md §6a`:
+
+```
+bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)      dx,dy,dz ∈ {-1, 0, +1}
+```
+
+`mPtrs[13]` (the center, `bit(0,0,0)`) always points to
+`&tree.getFirstNode<0>()[mCurrentLeafID]`. The 26 non-center entries are populated
+lazily by `prefetch` calls.
+
+**Cache advance:** when `mCurrentLeafID` changes:
+
+```cpp
+void advance(uint32_t newLeafID) {
+    mCurrentLeafID = newLeafID;
+    mProbedMask = 0;  // stale neighbor ptrs; force re-probe before use
+    mCenterLeafCoord = tree.getFirstNode<0>()[newLeafID].origin();
+    // mPtrs[] entries are stale but harmless; mProbedMask=0 prevents their use
+}
+```
+
+---
+
+## 3. Eviction and the `leafMask` — The Straddle Problem
+
+This is the key structural difference from the scalar accessor.
+
+In the scalar case, "cache miss" and "eviction" are the same event — the single voxel
+is either in the cached leaf or it isn't. In the batch case they decouple:
+
+- **Straddle lanes**: active voxels in the batch that belong to a *later* leaf
+  (`leafIndex[i] != currentLeafID`, `leafMask[i] = false`). The cache is still valid
+  for the remaining current-leaf lanes. No eviction.
+- **Eviction**: `none_of(leafMask)` — no lane in this batch belongs to the current
+  leaf. Only then does `advance()` fire.
+
+`leafMask` is therefore the accessor's **partial-hit signal** — a concept that has
+no scalar analog. Without it, the accessor would evict prematurely on every straddle
+batch, losing the cross-batch amortization that makes `mProbedMask` valuable.
+
+The straddle lane problem is solved at the call site by masking: straddle lanes receive
+a sentinel voxelOffset value (`kSentinelExpanded = expandVoxelOffset(292)`, local
+coordinate (4,4,4)) that produces no false direction bits in either the plus-OR or
+minus-AND reduction. This is already implemented and verified in the Phase 1
+prototype (`ex_stencil_gather_cpu`).
+
+---
+
+## 4. The Prefetch Insight — Extremal Taps as a Neighborhood Census
+
+The naive "vanilla accessor" approach would issue a `probeLeaf` call on first access
+for each stencil tap, lazily. The `BatchAccessor` exploits **domain-specific
+knowledge of the stencil geometry** to warm the cache with a minimal set of
+strategically chosen taps — the *extremal* taps — that together constitute a complete
+census of the neighborhood.
+
+### 4a. WENO5 (Axis-Aligned, Reach R=3) — 6 Extremal Taps
+
+For an axis-aligned stencil, only one axis can cross a leaf boundary per tap. The
+condition for needing the x+ neighbor leaf is:
+
+```
+∃ delta ∈ {1..R} s.t. lx + delta ≥ 8   ↔   lx ≥ 8 − R
+```
+
+The extremal tap at `+R` detects exactly `lx + R ≥ 8 ↔ lx ≥ 8 − R` — which is the
Any smaller delta for +the same voxel would probe the same x+ leaf if it crosses, or not cross at all. + +Therefore, prefetching the 6 extremal taps covers all directions needed by any +intermediate tap: + +``` +prefetch<+R, 0, 0>, prefetch<-R, 0, 0> → x+ / x- face leaves +prefetch< 0,+R, 0>, prefetch< 0,-R, 0> → y+ / y- face leaves +prefetch< 0, 0,+R>, prefetch< 0, 0,-R> → z+ / z- face leaves +``` + +For WENO5 with R=3: **6 probeLeaf calls maximum** per center leaf, covering all +19 stencil taps. This is identical to what `computeNeededDirs` computes (the carry +trick encodes all 6 thresholds simultaneously). + +### 4b. 3×3×3 Box Stencil (R=1) — 8 Corner Taps + +For the box stencil, a stencil tap at `(lx+dx, ly+dy, lz+dz)` where `dx,dy,dz ∈ +{-1,0,+1}` can cross one, two, or three axes simultaneously (face, edge, or corner +neighbor leaf respectively). + +**Claim**: the 8 corner taps `(±1, ±1, ±1)` collectively cover all 26 non-center +neighbor directions for any voxel position in the batch. + +**Coverage argument**: For any voxel `(lx, ly, lz)` and any direction +`(dx, dy, dz)` that the stencil actually needs (i.e., some coordinate crosses a leaf +boundary), there exists a corner tap `(sx, sy, sz)` with `sx, sy, sz ∈ {-1, +1}` +such that when applied to this voxel it probes the **same neighbor leaf**. + +Concretely, the corner tap `(-1,-1,+1)` applied to voxel `(0, 0, 4)` accesses +`(-1, -1, 5)`, which falls in the `(x−, y−)` edge leaf — the same leaf needed by +the edge tap `(-1, -1, 0)` for this voxel. The corner tap `(-1,+1,-1)` for the +same voxel accesses `(-1, 1, 3)`, falling in the `x−` face leaf — the same leaf +needed by `(-1, 0, 0)`. + +Each corner tap, applied to varying voxel positions in the batch, will probe face, +edge, or corner leaves depending on how many axes actually cross — collectively +exhausting all 26 directions across the batch. + +**At most 8 probeLeaf calls** per center leaf for the full 27-point box stencil +(in practice fewer, since many corner taps land in the center leaf for interior +voxels, and `mProbedMask` prevents re-probing the same direction twice). + +--- + +## 5. API — Three Tiers + +### 5a. Core Functions + +```cpp +// ── Tier 1a: warm the cache for a specific stencil offset ────────────────── +// For each active (leafMask) lane: compute which neighbor leaf the tap +// (di,dj,dk) falls in, probe it into mPtrs[] if not already in mProbedMask. +// Takes treeAcc — may call probeLeaf. +template +void prefetch(Simd vo, LaneMask leafMask, AccT& treeAcc); + +// ── Tier 1b: read from cache (cache assumed warm) ────────────────────────── +// For each active lane: compute local offset within the cached neighbor leaf, +// fetch and return the value (or index for ValueOnIndex grids). +// Does NOT take treeAcc — guaranteed not to touch the tree. +// Debug builds assert mProbedMask covers the needed direction. +template +Simd cachedGetValue(Simd vo, LaneMask leafMask) const; + +// ── Tier 2: lazy combined operation (vanilla accessor style) ─────────────── +// Equivalent to prefetch + cachedGetValue. +// Correct without explicit prefetch management; slightly suboptimal for +// repeated calls in the same center leaf (redundant bitmask checks). +template +Simd getValue(Simd vo, LaneMask leafMask, AccT& treeAcc); +``` + +The presence or absence of `treeAcc` in the signature is self-documenting: +`cachedGetValue` is the only function that can be called in a "no tree access" +context, and the compiler enforces that it doesn't get one. + +### 5b. 
+
+**Tier 1 — production path** (explicit prefetch, recommended for performance-critical
+stencil kernels):
+
+```cpp
+// Warm the cache with the 6 WENO5 extremal taps
+batchAcc.prefetch<-3, 0, 0>(vo, leafMask, treeAcc);
+batchAcc.prefetch<+3, 0, 0>(vo, leafMask, treeAcc);
+batchAcc.prefetch< 0,-3, 0>(vo, leafMask, treeAcc);
+batchAcc.prefetch< 0,+3, 0>(vo, leafMask, treeAcc);
+batchAcc.prefetch< 0, 0,-3>(vo, leafMask, treeAcc);
+batchAcc.prefetch< 0, 0,+3>(vo, leafMask, treeAcc);
+
+// All cachedGetValue calls are pure arithmetic + gather — no tree access
+auto u_m3 = batchAcc.cachedGetValue<-3, 0, 0>(vo, leafMask);
+auto u_m2 = batchAcc.cachedGetValue<-2, 0, 0>(vo, leafMask);
+auto u_m1 = batchAcc.cachedGetValue<-1, 0, 0>(vo, leafMask);
+auto u_0  = batchAcc.cachedGetValue< 0, 0, 0>(vo, leafMask);
+auto u_p1 = batchAcc.cachedGetValue<+1, 0, 0>(vo, leafMask);
+auto u_p2 = batchAcc.cachedGetValue<+2, 0, 0>(vo, leafMask);
+auto u_p3 = batchAcc.cachedGetValue<+3, 0, 0>(vo, leafMask);
+// ... y and z axes similarly
+
+Simd<float> flux_x = wenoKernel(u_m3, u_m2, u_m1, u_0, u_p1, u_p2, u_p3);
+```
+
+**Tier 2 — prototyping path** (lazy, correct, no explicit prefetch management):
+
+```cpp
+// Identical stencil formula; each getValue probes lazily on first need
+auto u_m3 = batchAcc.getValue<-3, 0, 0>(vo, leafMask, treeAcc);
+auto u_m2 = batchAcc.getValue<-2, 0, 0>(vo, leafMask, treeAcc);
+// ...
+```
+
+The redundant `prefetch` calls inside non-extremal `getValue` invocations reduce to
+a single `mProbedMask` bitmask check and immediate return — the direction was already
+probed by an earlier extremal call.
+
+### 5c. Invariant Ordering
+
+In Tier 1, all `prefetch` calls must precede all `cachedGetValue` calls for the same
+batch. A debug-mode RAII scope guard (`batchAcc.beginGather()` / `endGather()`) could
+enforce this, but is probably overkill for a first implementation.
+
+---
+
+## 6. Template vs Runtime Interface
+
+### 6a. Arguments for `<di,dj,dk>` Template Parameters
+
+- **Compile-time direction resolution**: for `cachedGetValue<-3,0,0>`, the compiler
+  proves only lx can cross, and only leftward. The direction bit reduces to a
+  compile-time choice between two constants (`mPtrs[4]` or `mPtrs[13]`); y/z
+  boundary checks are eliminated entirely.
+- **Dead axis elimination**: for axis-aligned taps, two of the three axis checks
+  vanish at compile time.
+- **VDB convention alignment**: `WenoPt<i,j,k>::idx`, `NineteenPt<i,j,k>::idx` —
+  the ecosystem already addresses stencil points as compile-time named entities.
+- **Structural contract**: the `prefetch`/`cachedGetValue` pairing is expressible as
+  a static invariant when offsets are compile-time constants.
+
+### 6b. When Runtime `nanovdb::Coord` Is Needed
+
+A generic `computeStencil` that iterates over `StencilT::offsets` at
+runtime cannot use template parameters. A runtime overload:
+
+```cpp
+Simd<ValueT> getValue(nanovdb::Coord offset,
+                      Simd<uint16_t> vo,
+                      LaneMask leafMask, AccT& treeAcc);
+```
+
+dispatches through a small switch on the runtime direction bit (26 cases, easily
+predicted). The gather still dominates; the dispatch overhead is negligible.
+
+**C++20 note**: if `nanovdb::Coord` is made a structural type, the template and
+runtime interfaces unify naturally:
+
+```cpp
+template <nanovdb::Coord Offset>
+Simd<ValueT> cachedGetValue(Simd<uint16_t> vo, LaneMask leafMask) const;
+
+// Called as:
+batchAcc.cachedGetValue<nanovdb::Coord(-3, 0, 0)>(vo, leafMask);
+```
+
+### 6c. Recommendation
+
+- **Template `<di,dj,dk>`** as the primary, idiomatic interface for all hand-written
+  stencil kernels — cleaner codegen, natural fit with VDB conventions.
+- **Runtime `Coord` overload** for generic stencil adapters and prototyping loops.
+- Both interfaces backed by the same `mPtrs[]` / `mProbedMask` state machine.
+
+---
+
+## 7. AVX2 Vectorization Profile
+
+### 7a. `prefetch` — Crossing Detection
+
+```
+Extract lx/ly/lz from all 16 vo lanes      vpsrl / vpand ymm       (SIMD)
+Compare lx+di against [0,7]                vpcmpgtd ymm            (SIMD)
+Fold crossing mask to scalar bitmask       vmovmskps ymm           (SIMD)
+AND with ~mProbedMask                      scalar bitmask check
+If new direction needed: probeLeaf         scalar (≤1 call per prefetch for WENO5)
+```
+
+Structurally identical to the `computeNeededDirs` carry trick in the prototype
+(indeed, `prefetch` is `computeNeededDirs` specialized to a single tap).
+
+### 7b. `cachedGetValue` — Offset Arithmetic and Gather
+
+```
+Compute neighbor offsets for all 16 lanes:
+  nx[i] = lx[i] + di, wrapped to [0,7]          vpaddd / vpand ymm  (SIMD, constant di)
+  local offset[i] = nx[i]*64 + ny[i]*8 + nz[i]  vpmadd / vpaddd ymm
+
+Determine which lanes cross to neighbor leaf:
+  crossMask = (lx < threshold)                  vpcmpgtd ymm        (SIMD)
+
+Gather values from (at most) two leaf arrays:
+  centerVals   = gather(mPtrs[13]->array, offset)          vgatherdps ymm (SIMD)
+  neighborVals = gather(mPtrs[dir]->array, offset_wrapped) vgatherdps ymm (SIMD)
+  result = blend(crossMask, neighborVals, centerVals)      vpblendvb ymm  (SIMD)
+```
+
+The key insight: for axis-aligned WENO5 taps, there are **at most two distinct leaf
+pointers** across all 16 lanes. This reduces the gather to two base-pointer loads
+plus a predicated blend — a clean AVX2 pattern.
+
+### 7c. Comparison to Phase 1 Prototype
+
+The two scalar bottlenecks in the prototype are eliminated by `BatchAccessor`:
+
+| Phase 1 bottleneck | `BatchAccessor` replacement | AVX2? |
+|--------------------|----------------------------|-------|
+| `expandVoxelOffset` scatter (conditional per-lane) | `cachedGetValue` offset arithmetic (uniform SIMD add) | ✓ |
+| `batchPtrs` fill (pointer scatter, data-dependent) | Crossing mask + gather + blend | ✓ |
+| `probeLeaf` loop | `prefetch` (≤1 probeLeaf per call) | inherently scalar |
+
+The WENO kernel itself (`wenoKernel(u_m3, ..., u_p3)`) operates entirely on
+`Simd<float>` with no tree access in sight.
+
+### 7d. Complete Per-Batch AVX2 Profile
+
+| Operation | Instructions | Vectorized? |
+|-----------|-------------|-------------|
+| `activeMask` computation | `vpcmpeqd ymm ×4` + `vmovmskps ×2` | ✓ Full |
+| `leafMask` computation | `vpbroadcastd` + `vpcmpeqd ymm ×2` + `vmovmskps ×2` | ✓ Full |
+| `prefetch` crossing detection | `vpcmpgtd ymm` + `vmovmskps` | ✓ Full |
+| `probeLeaf` (per prefetch) | scalar tree traversal | inherently scalar |
+| `cachedGetValue` offset arithmetic | `vpaddd ymm` / `vpand ymm` | ✓ Full |
+| `cachedGetValue` lane split | `vpcmpgtd ymm` | ✓ Full |
+| `cachedGetValue` value gather | `vgatherdps ymm ×2` + `vpblendvb ymm` | ✓ Full |
+| WENO kernel | `Simd<float>` arithmetic | ✓ Full |
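+
+For reference, the §7b gather + blend step maps onto AVX2 intrinsics roughly as
+below — a minimal sketch for 8 of the 16 lanes, float values, a single positive
+x-axis tap `di`, and a non-null hi-neighbor leaf (function and variable names are
+illustrative, not the actual implementation):
+
+```cpp
+#include <immintrin.h>
+
+// lxv: per-lane local x coordinates; localOff: per-lane 9-bit voxel offsets.
+static inline __m256 gatherTapXPlus(const float* centerLeaf, const float* hiLeaf,
+                                    __m256i lxv, __m256i localOff, int di /* > 0 */)
+{
+    __m256i nx    = _mm256_add_epi32(lxv, _mm256_set1_epi32(di));   // lx + di
+    __m256i cross = _mm256_cmpgt_epi32(nx, _mm256_set1_epi32(7));   // nx > 7 → hi leaf
+    __m256i off   = _mm256_add_epi32(localOff,
+                                     _mm256_set1_epi32(di * 64));   // x stride = 64
+    __m256i wrap  = _mm256_and_si256(off, _mm256_set1_epi32(511));  // wrap mod 512
+    __m256  c = _mm256_i32gather_ps(centerLeaf, wrap, 4);  // crossing lanes read a
+    __m256  n = _mm256_i32gather_ps(hiLeaf,     wrap, 4);  // discarded in-bounds slot
+    return _mm256_blendv_ps(c, n, _mm256_castsi256_ps(cross));      // pick per lane
+}
+```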
+
+---
+
+## 8. Scoping and Lifetime
+
+A `BatchAccessor` is scoped to **one CPU thread**, constructed once before the block
+loop and reused across all batches and all blocks:
+
+```cpp
+BatchAccessor batchAcc(grid, firstLeafID[0]);
+
+for (uint32_t bID = 0; bID < nBlocks; bID++) {
+    decodeInverseMaps(..., leafIndex, voxelOffset);
+
+    for (int b = 0; b < BlockWidth; b += SIMDw) {
+        // compute activeMask, leafMask ...
+        while (any_of(activeMask)) {
+            if (none_of(leafMask)) {
+                batchAcc.advance(++currentLeafID);
+                continue;
+            }
+            // prefetch / cachedGetValue / kernel ...
+        }
+    }
+}
+```
+
+**Cross-block carryover**: resetting `mProbedMask` between blocks is safe and simple.
+Carrying over is also valid — consecutive blocks process spatially adjacent leaves,
+so some `mPtrs[]` entries may still be correct. In practice, resetting is recommended
+(one `mProbedMask = 0` per block, negligible cost) to avoid subtle stale-pointer bugs.
+
+---
+
+## 9. Relationship to the Phase 1 Prototype
+
+`ex_stencil_gather_cpu` (`stencil_gather_cpu.cpp`) implements the core cache
+machinery as free functions:
+
+| Prototype component | `BatchAccessor` equivalent |
+|--------------------|-----------------------------|
+| `probedMask` + `ptrs[27]` locals | `mProbedMask` + `mPtrs[27]` members |
+| `computeNeededDirs(expandedVec)` | inner logic of `prefetch` (one tap) |
+| `kSentinelExpanded` broadcast | same sentinel in `prefetch` for straddle lanes |
+| `probeLeaf` loop (`toProbe` bits) | `prefetch` body |
+| `batchPtrs[4][SIMDw]` population | replaced by `cachedGetValue` gather + blend |
+| `verifyBatchPtrs` | future: `cachedGetValue` unit test |
+
+Phase 2 (not yet implemented): `cachedGetValue` — the actual index/value gather from
+the cached leaf pointers. The AVX2 machinery for crossing detection and offset
+arithmetic is a direct extension of what is already working and verified in Phase 1.
+
+---
+
+## 10. Open Questions / Future Work
+
+- **`ValueOnIndex` two-level fetch**: `cachedGetValue` returns `Simd<uint64_t>`
+  indices for index grids; a `cachedGetValue(channel, vo, leafMask)` overload
+  dereferences through a channel pointer in one step. Channel data layout (AoS vs SoA)
+  affects gather efficiency.
+
+- **Multi-leaf stencils (R > 4)**: the single-neighbor-per-axis assumption breaks for
+  stencils with reach R > 4 (a center voxel can simultaneously need both the lo and hi
+  neighbor along the same axis). `mPtrs[27]` remains correct; only the `cachedGetValue`
+  lane-split logic (currently "at most 2 leaf pointers per axis tap") needs generalization.
+
+- **Generic stencil adapter**: a `computeStencil` wrapper that calls
+  `getValue(StencilT::offset(n), ...)` for `n = 0..N-1` via the runtime `Coord`
+  overload — correctness-first entry point for new stencil types (see the sketch
+  at the end of this document).
+
+- **C++20 structural `Coord`**: unify template and runtime interfaces with
+  `cachedGetValue<Offset>(vo, leafMask)` non-type template parameter.
+
+- **Debug-mode RAII scope guard**: enforce the prefetch-before-cachedGetValue ordering
+  in debug builds without any runtime cost in release.
+
+- **Launcher integration**: the `BatchAccessor` is a per-block, per-thread object.
+  The system-level launcher (the `buildVoxelBlockManager` analogue for stencil
+  computation) constructs one per worker thread and passes it into the per-block kernel.
+  Design of the launcher is deferred until the per-block kernel is fully validated.
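+
+A possible shape for the generic adapter named in §10 (a correctness-first sketch;
+`StencilT` with `SIZE` and `offset(n)` members is a hypothetical interface, not
+existing code):
+
+```cpp
+// Tier-2 path: every tap goes through the lazy runtime-Coord overload, trading
+// the extremal-tap prefetch optimization for simplicity.
+template <typename StencilT, typename BatchAccT, typename AccT, typename ValueVecT>
+void computeStencil(BatchAccT& batchAcc, AccT& treeAcc,
+                    Simd<uint16_t> vo, LaneMask leafMask,
+                    ValueVecT (&out)[StencilT::SIZE])
+{
+    for (int n = 0; n < StencilT::SIZE; ++n)
+        out[n] = batchAcc.getValue(StencilT::offset(n), vo, leafMask, treeAcc);
+}
+```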
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
index f8795353d4..eb77a022f2 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
@@ -14,6 +14,12 @@ kernel lambda that operates on the gathered values and produces the output.
 
 ---
 
+**Related design document**: `BatchAccessor.md` (same directory) — full design of the
+SIMD batch leaf-neighborhood cache that provides the `prefetch` / `cachedGetValue` /
+`getValue` API used by the stencil kernel in Phase 2.
+
+---
+
 ## 1. Scope and Place in the Architecture
 
 The stencil gather sits at the **second level** of the two-level VBM parallelism

From 8db0c52be9a58086e304cd020bc09856d7f7839a Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Wed, 15 Apr 2026 15:01:14 -0500
Subject: [PATCH 16/60] BatchAccessor: SIMD batch leaf-neighborhood cache
 (Phase 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces BatchAccessor, the SIMD analog of NanoVDB's ValueAccessor.
Instead of caching the path to one leaf, it caches the 27-entry 3×3×3
neighbor pointer table around the current center leaf and serves a SIMD
batch of LaneWidth voxels per call.

Key design decisions
--------------------
- Eager center: constructor and advance() populate mLeafNeighbors[13]
  directly (O(1), no probeLeaf), so cachedGetValue<0,0,0> is valid
  immediately and the probe loop never needs a center special-case.

- SWAR neededMask: prefetch expands the 9-bit voxel offsets into a 15-bit
  packed form (lz@[0:2], lx@[6:8], ly@[12:14]) using SIMD bitwise ops,
  then adds a compile-time packed stencil offset and checks carry bits
  for crossing detection. One vpaddw YMM instruction covers all 16 lanes;
  clang folds packed_d into the blend constants at compile time, reducing
  the expand+blend+add to 5 SIMD instructions.

- Heterogeneous where: Simd.h gains where(SimdMask<T1, W>, Simd<T2, W>,
  Simd<T2, W>) so a PredicateT = SimdMask<uint32_t, W> can gate a
  VoxelOffsetT = Simd<uint16_t, W> blend without explicit casting.
  Array backend uses a trivial bool loop; stdx backend converts via a
  bool[] round-trip.

- Correctness verified in-process: stencil_gather_cpu.cpp integrates
  BatchAccessor as an alternate execution path and cross-checks all 18
  WENO5 tap directions against direct tree references (12.3M lane checks).

Simd.h additions (array backend)
---------------------------------
- SimdMask<T, W>: converting constructor from SimdMask<U, W>
- Simd<T, W>: operator|, &, ^, <<(Simd), >>(Simd)
- where: heterogeneous mask overload (both backends)

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 .../stencil_gather_cpu.cpp                    | 141 ++++-
 .../BatchAccessor.h                           | 330 ++++++++++++
 .../BatchAccessor.md                          | 510 +++++++-----------
 simd_test/Simd.h                              | 143 +++++
 4 files changed, 806 insertions(+), 318 deletions(-)
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h

diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index 88b03a62b1..2488e13c3b 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -35,6 +35,7 @@
 #include
 #include
 #include   // SimdMask, Simd, any_of, none_of, to_bitmask
+#include "../ex_voxelBlockManager_host_cuda/BatchAccessor.h" // BatchAccessor
 #include
 #include
@@ -63,6 +64,14 @@ using AccT = nanovdb::DefaultReadAccessor<BuildT>;
 using LeafIdxVec = nanovdb::util::Simd<uint32_t, SIMDw>;
 using LaneMask = nanovdb::util::SimdMask<uint32_t, SIMDw>;
 
+// BatchAccessor instantiation for correctness cross-validation.
+// ValueT = Simd<uint64_t, SIMDw> because ValueOnIndex leaf values are uint64_t active indices.
+using BAccT = nanovdb::BatchAccessor<BuildT,
+                  nanovdb::util::Simd<uint64_t, SIMDw>,  // ValueT
+                  nanovdb::util::Simd<uint16_t, SIMDw>,  // VoxelOffsetT
+                  nanovdb::util::Simd<uint32_t, SIMDw>,  // LeafIDT (unused by BatchAccessor internals)
+                  LaneMask>;                              // PredicateT
+
 // Direction bit encoding shared across all stencil types:
 //   bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1),   dx,dy,dz ∈ {-1,0,+1}
 //
@@ -508,6 +517,112 @@ static void verifyBatchPtrs(
     }
 }
 
+// ============================================================
+// BatchAccessor correctness verification
+//
+// checkOneTap: calls batchAcc.cachedGetValue for stencil tap (di,dj,dk),
+// then for each active lane compares the result against a direct tree reference.
+//
+// Assumes the caller has already issued the 6 WENO5 extremal prefetches so that
+// all directions reachable by ±3 along any axis are in mProbedMask.
+// ============================================================
+
+template <int di, int dj, int dk>
+static void checkOneTap(
+    const BAccT& batchAcc,
+    nanovdb::util::Simd<uint16_t, SIMDw> voVec,
+    LaneMask leafMask,
+    nanovdb::Coord centerLeafOrigin,
+    const LeafT* firstLeaf,
+    uint32_t currentLeafID,
+    const uint16_t* voxelOffset,
+    int batchStart,
+    AccT& refAcc,
+    VerifyStats& stats)
+{
+    nanovdb::util::Simd<uint64_t, SIMDw> tapResult(uint64_t(0));
+    batchAcc.cachedGetValue<di, dj, dk>(tapResult, voVec, leafMask);
+
+    for (int i = 0; i < SIMDw; ++i) {
+        if (!leafMask[i]) continue;
+        ++stats.laneChecks;
+
+        const uint16_t vo_i = voxelOffset[batchStart + i];
+        const int lx = (vo_i >> 6) & 7;
+        const int ly = (vo_i >> 3) & 7;
+        const int lz = vo_i & 7;
+        const int nx = lx + di, ny = ly + dj, nz = lz + dk;
+        const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0;
+        const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0;
+        const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0;
+        const int nx_w = nx - dx * 8;
+        const int ny_w = ny - dy * 8;
+        const int nz_w = nz - dz * 8;
+        const uint32_t offset = uint32_t(nx_w) * 64u + uint32_t(ny_w) * 8u + uint32_t(nz_w);
+
+        const LeafT* refLeaf;
+        if (dx == 0 && dy == 0 && dz == 0) {
+            refLeaf = &firstLeaf[currentLeafID];
+        } else {
+            refLeaf = refAcc.probeLeaf(
+                centerLeafOrigin + nanovdb::Coord(dx * 8, dy * 8, dz * 8));
+        }
+
+        const uint64_t expected = refLeaf
+            ? static_cast<uint64_t>(refLeaf->getValue(offset))
+            : uint64_t(0);
+        const uint64_t actual = static_cast<uint64_t>(tapResult[i]);
+
+        if (actual != expected) {
+            ++stats.errors;
+            if (stats.errors <= 10) {
+                std::cerr << "BATCHACC MISMATCH"
+                          << " tap=(" << di << "," << dj << "," << dk << ")"
+                          << " lane=" << i
+                          << " expected=" << expected
+                          << " actual=" << actual << "\n";
+            }
+        }
+    }
+}
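+
+// (Wrap example for the reference arithmetic above: lx = 1, di = -3 gives
+//  nx = -2 → dx = -1, nx_w = 6 — the reference reads x-slot 6 of the leaf at
+//  centerLeafOrigin + (-8, 0, 0).)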
+
+/// @brief Cross-validate BatchAccessor::cachedGetValue for all 18 WENO5 non-center taps.
+/// Requires the 6 extremal prefetches to have been called first.
+static void verifyBatchAccessor(
+    const BAccT& batchAcc,
+    nanovdb::util::Simd<uint16_t, SIMDw> voVec,
+    LaneMask leafMask,
+    nanovdb::Coord centerLeafOrigin,
+    const LeafT* firstLeaf,
+    uint32_t currentLeafID,
+    const uint16_t* voxelOffset,
+    int batchStart,
+    AccT& refAcc,
+    VerifyStats& stats)
+{
+    // x-axis taps (di in {-3,-2,-1,+1,+2,+3})
+    checkOneTap<-3, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap<-2, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap<-1, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap<+1, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap<+2, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap<+3, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    // y-axis taps
+    checkOneTap< 0,-3, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0,-2, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0,-1, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0,+1, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0,+2, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0,+3, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    // z-axis taps
+    checkOneTap< 0, 0,-3>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0, 0,-2>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0, 0,-1>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0, 0,+1>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0, 0,+2>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+    checkOneTap< 0, 0,+3>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
+}
+
 // ============================================================
 // Main prototype: Phase 1 (neighbor leaf resolution) + verification
 // ============================================================
@@ -559,6 +674,9 @@ static void runPrototype(const GridT*
         const LeafT* ptrs[27] = {};
         nanovdb::Coord centerLeafCoord = firstLeaf[currentLeafID].origin();
 
+        // BatchAccessor: alternate execution path for correctness cross-validation.
for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { @@ -588,6 +706,7 @@ static void runPrototype(const GridT* currentLeafID++; probedMask = 0; centerLeafCoord = firstLeaf[currentLeafID].origin(); + batchAcc.advance(currentLeafID); continue; } @@ -645,16 +764,34 @@ static void runPrototype(const GridT* } } - // --- Verification --- + // --- Verification (Phase 1 pointer check) --- verifyBatchPtrs(batchPtrs, firstLeaf, leafIndex, voxelOffset, batchStart, leafMask, acc, stats); + // --- BatchAccessor alternate path + cross-validation --- + // + // 6 extremal WENO5 prefetches cover all face-neighbor directions. + // The center direction (dir(0,0,0)) is guaranteed populated by at + // least one of these calls (see BatchAccessor.md §5). + using VoVecT = nanovdb::util::Simd; + const VoVecT voVec(&voxelOffset[batchStart], nanovdb::util::element_aligned); + batchAcc.prefetch<-3, 0, 0>(voVec, leafMask); + batchAcc.prefetch<+3, 0, 0>(voVec, leafMask); + batchAcc.prefetch< 0, -3, 0>(voVec, leafMask); + batchAcc.prefetch< 0, +3, 0>(voVec, leafMask); + batchAcc.prefetch< 0, 0, -3>(voVec, leafMask); + batchAcc.prefetch< 0, 0, +3>(voVec, leafMask); + + verifyBatchAccessor(batchAcc, voVec, leafMask, centerLeafCoord, + firstLeaf, currentLeafID, voxelOffset, + batchStart, acc, stats); + activeMask = activeMask & !leafMask; } } } - std::cout << "Prototype (Phase 1 verification):\n" + std::cout << "Prototype (Phase 1 + BatchAccessor verification):\n" << " blocks = " << nBlocks << "\n" << " voxels = " << nVoxels << "\n" << " straddles = " << nStraddles << "\n" diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h new file mode 100644 index 0000000000..f2b3f18253 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h @@ -0,0 +1,330 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file BatchAccessor.h + + \brief SIMD-batch analog of NanoVDB's ValueAccessor. + + Caches the 27-entry 3×3×3 leaf-neighbor pointer table around the current + center leaf, amortizing probeLeaf calls across all batches that process + voxels within that leaf. + + Design documented in: + nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md + + Template parameters + ------------------- + BuildT NanoVDB build type (determines tree / leaf types). + ValueT Scalar or SIMD result type of cachedGetValue. + For NanoGrid: float or Simd + For NanoGrid: uint64_t or Simd + VoxelOffsetT Compact (9-bit) voxel offset within a leaf. + Scalar path: uint16_t. SIMD path: Simd. + LeafIDT Leaf index type — reserved for future use by the caller loop. + Scalar: uint32_t. SIMD: Simd. + PredicateT Per-lane active predicate (the leafMask). + Scalar: bool. SIMD: SimdMask or similar. + + Usage + ----- + Scalar defaults allow instantiation without a SIMD library. + For SIMD use, substitute the concrete Simd<> and SimdMask<> types. 
+ + API (see BatchAccessor.md §5 for the full design): + - advance(newLeafID) — move to a new center leaf + - prefetch(vo, mask) — warm cache for tap (di,dj,dk) + - cachedGetValue(result, vo, mask) — fill masked result lanes +*/ + +#pragma once + +#include +#include // simd_traits, Simd, SimdMask +#include +#include + +namespace nanovdb { + +// ============================================================================= +// BatchAccessor +// ============================================================================= + +template +class BatchAccessor +{ + using GridT = NanoGrid; + using TreeT = typename GridT::TreeType; + using LeafT = typename TreeT::LeafNodeType; + + using VO_traits = util::simd_traits; + using Pred_traits = util::simd_traits; + using Val_traits = util::simd_traits; + + // Scalar element type of ValueT (e.g. float for Simd) + using ScalarValueT = typename Val_traits::scalar_type; + + static constexpr int LaneWidth = VO_traits::width; + + static_assert(VO_traits::width == Pred_traits::width, + "BatchAccessor: VoxelOffsetT and PredicateT must have the same lane width"); + static_assert(Val_traits::width == 1 || Val_traits::width == VO_traits::width, + "BatchAccessor: ValueT lane width must be 1 (scalar) or match VoxelOffsetT"); + +public: + // ------------------------------------------------------------------------- + // Direction encoding + // + // bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz ∈ {-1,0,+1} + // + // Selected entries: + // dir( 0, 0, 0) = 13 — center leaf (mLeafNeighbors[13]) + // dir(-1, 0, 0) = 4 — x-minus face + // dir(+1, 0, 0) = 22 — x-plus face + // dir( 0,-1, 0) = 10 — y-minus face + // dir( 0,+1, 0) = 16 — y-plus face + // dir( 0, 0,-1) = 12 — z-minus face + // dir( 0, 0,+1) = 14 — z-plus face + // ------------------------------------------------------------------------- + static constexpr int dir(int dx, int dy, int dz) + { + return (dx + 1) * 9 + (dy + 1) * 3 + (dz + 1); + } + + // ------------------------------------------------------------------------- + // Construction + // + // Eagerly populates mLeafNeighbors[dir(0,0,0)] (the center pointer) and + // marks bit 13 in mProbedMask. The center pointer is O(1) to compute + // (no probeLeaf needed), so there is no reason to defer it. + // + // Consequence: cachedGetValue<0,0,0> is valid immediately after construction + // without any prefetch call. The SWAR neededMask in prefetch never sets + // bit 13 (only crossings fire), so the eager center is never redundantly + // re-probed. + // ------------------------------------------------------------------------- + BatchAccessor(const GridT& grid, uint32_t firstLeafID) + : mGrid(grid) + , mCenterLeafID(firstLeafID) + , mCenterOrigin(grid.tree().getFirstLeaf()[firstLeafID].origin()) + , mProbedMask(1u << dir(0, 0, 0)) + { + for (auto& p : mLeafNeighbors) p = nullptr; + mLeafNeighbors[dir(0, 0, 0)] = &mGrid.tree().getFirstLeaf()[mCenterLeafID]; + } + + // ------------------------------------------------------------------------- + // advance — move to a new center leaf + // + // Call when none_of(leafMask): all active lanes have moved past mCenterLeafID. + // Repopulates the center pointer eagerly and resets mProbedMask to bit 13, + // so stale neighbor entries are blocked and the center is immediately valid. 
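+    //
+    // Typical call site (illustrative; mirrors the ex_stencil_gather_cpu
+    // prototype loop, where the eviction check runs once per batch):
+    //
+    //     if (util::none_of(leafMask)) {      // no active lane left in this leaf
+    //         batchAcc.advance(++currentLeafID);
+    //         continue;                       // recompute leafMask for the new leaf
+    //     }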
+ // ------------------------------------------------------------------------- + void advance(uint32_t newLeafID) + { + mCenterLeafID = newLeafID; + mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); + mLeafNeighbors[dir(0,0,0)] = &mGrid.tree().getFirstLeaf()[newLeafID]; + mProbedMask = (1u << dir(0, 0, 0)); + } + + // ------------------------------------------------------------------------- + // prefetch — warm the neighbor cache for stencil tap (di,dj,dk) + // + // For each active (leafMask) lane, computes which neighbor leaf the tap lands + // in and probes it into mLeafNeighbors[] if not already cached in mProbedMask. + // + // The center direction (dir(0,0,0)) is always pre-populated by the constructor + // and advance(), so it never appears in neededMask and never needs probeLeaf. + // Every direction in toProbe is therefore a genuine neighbor: full root-to-leaf + // traversal via mGrid.tree().root().probeLeaf(). + // + // A null result from probeLeaf means the neighbor leaf does not exist (outside + // the narrow band); cachedGetValue returns 0 for those lanes. + // ------------------------------------------------------------------------- + template + void prefetch(VoxelOffsetT vo, PredicateT leafMask) + { + // ----------------------------------------------------------------------- + // SWAR neededMask computation + // + // Replace the scalar per-lane loop with a single SIMD add + two horizontal + // reductions, using a 15-bit packed coordinate representation. + // + // packed_lc layout (one group per axis, zero-guard gaps): + // bits 0– 2: lz carry exits at bit 3 (z-axis crossing) + // bits 6– 8: lx carry exits at bit 9 (x-axis crossing) + // bits 12–14: ly carry exits at bit 15 (y-axis crossing) + // + // This is expandVoxelOffset() steps 1+2 only (no step 3), because for a + // fixed (di,dj,dk) each axis has exactly one possible crossing direction, + // so we need only one group per axis rather than two. + // + // packed_d = 3-bit two's complement of each offset placed in the same groups: + // dk & 7 at bits [0:2] (= 8+dk for dk<0, dk for dk>=0) + // di & 7 at bits [6:8] + // dj & 7 at bits [12:14] + // + // After SIMD add(packed_lc, packed_d): + // carry at bit 3 SET ↔ lz + dk ≥ 8 ↔ hi-z crossing (dk > 0) + // carry at bit 3 CLEAR ↔ lz + dk < 0 ↔ lo-z crossing (dk < 0) + // (same logic for x@bit9 and y@bit15) + // + // Inactive lanes carry the sentinel (lc = 4 per axis), which satisfies + // |d| ≤ 4: never fires a false hi-carry, never clears a lo-carry. + // + // For multi-axis taps (more than one nonzero component), the per-axis + // may-cross flags are combined conservatively: if two axes can independently + // cross, the edge/corner direction combining both is also added to neededMask. + // This may over-probe (extra probeLeaf if no single lane crosses both axes + // simultaneously) but never misses a direction any lane actually needs. + // For axis-aligned WENO5 taps (one nonzero component) there is no over-probing. + // ----------------------------------------------------------------------- + + // Use VoxelOffsetT (Simd) directly for the packed + // arithmetic: 16 × uint16_t = 256 bits = one YMM → one vpaddw, vs two + // vpaddd if we widened to uint32_t. All intermediate values fit in uint16_t: + // packed_lc ≤ 0x71C7, packed_d ≤ 0x71C7, sum ≤ 0xE38E < 0xFFFF. + static_assert(LaneWidth >= 16, + "BatchAccessor::prefetch SWAR requires LaneWidth >= 16"); + + // Compile-time packed stencil offset (3-bit two's complement per axis). 
+ // Using uint16_t arithmetic: dk & 7u fits in 3 bits, shifted into position. + static constexpr uint16_t packed_d = + static_cast( + (uint32_t(unsigned(dk) & 7u)) + | (uint32_t(unsigned(di) & 7u) << 6) + | (uint32_t(unsigned(dj) & 7u) << 12)); + + // Sentinel for inactive lanes: lc = (4,4,4) → packed = 4|(4<<6)|(4<<12) = 0x4104. + // Note: expandVoxelOffset(kInactiveVoxelOffset=292) = 0x4104 = kSentinel15. + // So even unconditionally expanded inactive-lane values would yield the correct + // sentinel. However, straddle lanes carry arbitrary vo values (from the next + // leaf), so we must apply leafMask before the add to avoid false crossing signals. + static constexpr uint16_t kSentinel15 = + static_cast(4u | (4u << 6u) | (4u << 12u)); + static constexpr uint16_t kMask15 = uint16_t(0x71C7u); + + // Expand the 9-bit voxel offset into the 15-bit SWAR packed form — + // one vpor + vpsllw + vpand (no scalar loop). + // bits [0:2] = lz, bits [6:8] = lx, bits [12:14] = ly + // Then blend: active lanes → expanded form, straddle/inactive → sentinel. + // util::where accepts SimdMask for any U (heterogeneous overload). + const VoxelOffsetT expanded = + (vo | (vo << VoxelOffsetT(uint16_t(9)))) & VoxelOffsetT(kMask15); + const VoxelOffsetT packed_lc = + util::where(leafMask, expanded, VoxelOffsetT(kSentinel15)); + + // One SIMD add across all LaneWidth lanes (one vpaddw YMM instruction). + const VoxelOffsetT packed_sum = packed_lc + VoxelOffsetT(packed_d); + + // Horizontal reductions: widen to uint32_t for the carry-bit checks. + uint32_t hor_or = 0u, hor_and = ~0u; + for (int i = 0; i < LaneWidth; ++i) { + const uint32_t s = static_cast( + static_cast(VO_traits::get(packed_sum, i))); + hor_or |= s; + hor_and &= s; + } + + // Per-axis may-cross flags: compile-time dispatch on sign of d. + bool x_cross = false, y_cross = false, z_cross = false; + if constexpr (di > 0) x_cross = bool(hor_or & (1u << 9)); + if constexpr (di < 0) x_cross = !bool(hor_and & (1u << 9)); + if constexpr (dj > 0) y_cross = bool(hor_or & (1u << 15)); + if constexpr (dj < 0) y_cross = !bool(hor_and & (1u << 15)); + if constexpr (dk > 0) z_cross = bool(hor_or & (1u << 3)); + if constexpr (dk < 0) z_cross = !bool(hor_and & (1u << 3)); + + // Compile-time crossing sign per axis. + constexpr int sx = (di > 0) ? 1 : -1; // only used when di != 0 + constexpr int sy = (dj > 0) ? 1 : -1; + constexpr int sz = (dk > 0) ? 1 : -1; + + // Build neededMask: face neighbors, then edge and corner (conservative). + uint32_t neededMask = 0u; + if constexpr (di != 0) { if (x_cross) neededMask |= (1u << dir(sx, 0, 0)); } + if constexpr (dj != 0) { if (y_cross) neededMask |= (1u << dir( 0, sy, 0)); } + if constexpr (dk != 0) { if (z_cross) neededMask |= (1u << dir( 0, 0, sz)); } + if constexpr (di != 0 && dj != 0) { if (x_cross && y_cross) neededMask |= (1u << dir(sx, sy, 0)); } + if constexpr (di != 0 && dk != 0) { if (x_cross && z_cross) neededMask |= (1u << dir(sx, 0, sz)); } + if constexpr (dj != 0 && dk != 0) { if (y_cross && z_cross) neededMask |= (1u << dir( 0, sy, sz)); } + if constexpr (di != 0 && dj != 0 && dk != 0) { if (x_cross && y_cross && z_cross) neededMask |= (1u << dir(sx, sy, sz)); } + + // Probe neighbor directions not already cached. + // Every direction here requires probeLeaf (center is pre-populated, never in toProbe). 
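+        //
+        // Worked example (illustrative, not part of the contract): for the tap
+        // <+3,0,0> and a lane with lc = (6,2,5), packed_lc holds 6 << 6 in the
+        // x-group and packed_d holds 3 << 6, so the sum's x-group is 9 << 6 and
+        // the carry lands in bit 9: hor_or has bit 9 set and x_cross fires.
+        // A lane with lx = 4 sums to 7 << 6 (no carry), but the OR across lanes
+        // still records the crossing, so neededMask gains dir(+1,0,0) exactly
+        // when some active lane's tap leaves the center leaf in +x.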
+        uint32_t toProbe = neededMask & ~mProbedMask;
+        while (toProbe) {
+            const int d = __builtin_ctz(toProbe);
+            mLeafNeighbors[d] = mGrid.tree().root().probeLeaf(originForDir(d));
+            mProbedMask |= (1u << d);
+            toProbe &= toProbe - 1;
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // cachedGetValue — fill masked result lanes from cached leaf table
+    //
+    // For each active (leafMask) lane, computes the local voxel offset within the
+    // appropriate neighbor leaf and calls leaf->getValue(offset).
+    //
+    // Requires prefetch<di,dj,dk> (or any prefetch covering the same directions)
+    // to have been called first. Debug builds assert mProbedMask coverage.
+    //
+    // A null leaf pointer (neighbor outside the narrow band) writes 0 to result.
+    // Inactive lanes (leafMask[i] == false) are not touched.
+    // -------------------------------------------------------------------------
+    template<int di, int dj, int dk>
+    void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const
+    {
+        for (int i = 0; i < LaneWidth; ++i) {
+            if (!Pred_traits::get(leafMask, i)) continue;
+            const auto vo_i = static_cast<uint32_t>(VO_traits::get(vo, i));
+            const int lx = (vo_i >> 6) & 7;
+            const int ly = (vo_i >> 3) & 7;
+            const int lz = vo_i & 7;
+            const int nx = lx + di, ny = ly + dj, nz = lz + dk;
+            const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0;
+            const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0;
+            const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0;
+            // Wrapped local coordinate within the neighbor leaf.
+            const int nx_w = nx - dx * 8;
+            const int ny_w = ny - dy * 8;
+            const int nz_w = nz - dz * 8;
+            // NanoVDB leaf layout, offset = x*64 + y*8 + z, applied to the
+            // wrapped local coordinate.
+            const uint32_t offset = uint32_t(nx_w) * 64u
+                                  + uint32_t(ny_w) * 8u
+                                  + uint32_t(nz_w);
+            const int d = dir(dx, dy, dz);
+            assert((mProbedMask & (1u << d)) && "cachedGetValue: direction not prefetched");
+            const LeafT* leaf = mLeafNeighbors[d];
+            const ScalarValueT val = leaf
+                ? static_cast<ScalarValueT>(leaf->getValue(offset))
+                : ScalarValueT(0);
+            Val_traits::set(result, i, val);
+        }
+    }
+
+private:
+    // Compute the index-space origin of the leaf at direction bit d from the center.
+    // bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1); leaf stride = 8 per axis.
+    Coord originForDir(int d) const
+    {
+        const int dx = d / 9 - 1;
+        const int dy = (d / 3) % 3 - 1;
+        const int dz = d % 3 - 1;
+        return mCenterOrigin + Coord(dx * 8, dy * 8, dz * 8);
+    }
+
+    const GridT& mGrid;
+    uint32_t     mCenterLeafID;
+    Coord        mCenterOrigin;
+    uint32_t     mProbedMask;
+    const LeafT* mLeafNeighbors[27];
+};
+
+} // namespace nanovdb
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
index 5331fbfe48..1353ee0cee 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
@@ -1,8 +1,7 @@
 # BatchAccessor — SIMD Batch Leaf-Neighborhood Cache
 
-This document is the design reference for `BatchAccessor`, the SIMD-batch analog
-of NanoVDB's `ValueAccessor`. It captures the full design rationale developed
-alongside the `ex_stencil_gather_cpu` Phase 1 prototype.
+Design reference for `BatchAccessor.h`. Captures the full design rationale
+and API contract developed alongside the `ex_stencil_gather_cpu` Phase 1 prototype.
 
 ---
 
@@ -10,423 +9,302 @@ alongside the `ex_stencil_gather_cpu` Phase 1 prototype.
NanoVDB's `DefaultReadAccessor` amortizes the cost of root-to-leaf tree traversal by caching the path for a single voxel. When successive scalar `getValue(ijk)` calls -land in the same leaf, only the first call pays the full traversal; subsequent calls -hit the cached leaf pointer in ~6 integer instructions. +land in the same leaf, only the first call pays the full traversal. `BatchAccessor` lifts this idea one level: instead of caching the path to one leaf, it caches the **3×3×3 neighborhood of leaf pointers** surrounding the current center -leaf. Instead of serving one voxel per call, it serves a **SIMD batch of SIMDw +leaf. Instead of serving one voxel per call, it serves a **SIMD batch of LaneWidth voxels** simultaneously. | Property | Scalar `ValueAccessor` | `BatchAccessor` | |----------|------------------------|-----------------| | Cache unit | Path root→leaf (3 node ptrs) | 27 neighbor leaf ptrs | -| Granularity | 1 voxel per call | SIMDw voxels per call | -| Cache key | Voxel coordinate in cached leaf's bbox | `currentLeafID` (VBM ordering) | +| Granularity | 1 voxel per call | LaneWidth voxels per call | +| Cache key | Voxel coordinate in cached leaf's bbox | `mCenterLeafID` | | "Hit" condition | Next voxel in same leaf | `mProbedMask` covers needed direction | | Eviction trigger | Implicit on any miss | Explicit: `none_of(leafMask)` | -| Guarantee of hit rate | Access-pattern dependent | Structural (VBM Morton ordering) | +| Hit rate guarantee | Access-pattern dependent | Structural (VBM Morton ordering) | The hit rate of the scalar accessor depends on the access pattern. `BatchAccessor`'s -amortization is **structural**: the VBM groups voxels by leaf, so within any batch, -the center leaf is known in advance, and directions probed for batch k remain valid -for all subsequent batches in the same center leaf. +amortization is **structural**: the VBM groups voxels by leaf, so within any batch the +center leaf is known in advance, and directions probed for batch k remain valid for all +subsequent batches in the same center leaf. --- -## 2. Cache State - -Four pieces of state persist across batches within one center leaf: - -```cpp -template -class BatchAccessor { - uint32_t mProbedMask = 0; // bit d set ↔ direction d has been probed - const LeafT* mPtrs[27] = {}; // canonical neighbor table; mPtrs[13] = center - uint32_t mCurrentLeafID; // index of current center leaf - nanovdb::Coord mCenterLeafCoord; // origin of current center leaf - // (plus a reference to the underlying grid for probeLeaf calls) -}; -``` - -`mPtrs[27]` uses the shared 3×3×3 direction encoding from `StencilGather.md §6a`: - -``` -bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1) dx,dy,dz ∈ {-1, 0, +1} -``` - -`mPtrs[13]` (the center, `bit(0,0,0)`) always points to -`&tree.getFirstNode<0>()[mCurrentLeafID]`. The 26 non-center entries are populated -lazily by `prefetch` calls. - -**Cache advance:** when `mCurrentLeafID` changes: +## 2. Template Parameters ```cpp -void advance(uint32_t newLeafID) { - mCurrentLeafID = newLeafID; - mProbedMask = 0; // stale neighbor ptrs; force re-probe before use - mCenterLeafCoord = tree.getFirstNode<0>()[newLeafID].origin(); - // mPtrs[] entries are stale but harmless; mProbedMask=0 prevents their use -} +template +class BatchAccessor; ``` ---- - -## 3. Eviction and the `leafMask` — The Straddle Problem - -This is the key structural difference from the scalar accessor. 
-
-In the scalar case, "cache miss" and "eviction" are the same event — the single voxel
-is either in the cached leaf or it isn't. In the batch case they decouple:
+| Parameter | Scalar default | SIMD example | Role |
+|-----------|----------------|--------------|------|
+| `BuildT` | — | — | NanoVDB build type; determines `LeafT`, `TreeT` |
+| `ValueT` | `float` | `Simd<float, 16>` | Return type of `cachedGetValue` |
+| `VoxelOffsetT` | `uint16_t` | `Simd<uint16_t, 16>` | Compact 9-bit voxel offset within a leaf |
+| `LeafIDT` | `uint32_t` | `Simd<uint32_t, 16>` | Per-lane leaf ID (reserved for caller loop) |
+| `PredicateT` | `bool` | `SimdMask<uint16_t, 16>` | Per-lane active predicate |
 
-- **Straddle lanes**: active voxels in the batch that belong to a *later* leaf
-  (`leafIndex[i] != currentLeafID`, `leafMask[i] = false`). The cache is still valid
-  for the remaining current-leaf lanes. No eviction.
-- **Eviction**: `none_of(leafMask)` — no lane in this batch belongs to the current
-  leaf. Only then does `advance()` fire.
+For `NanoGrid<ValueOnIndex>`, use `ValueT = uint64_t` (scalar) or
+`ValueT = Simd<uint64_t, 16>` (SIMD).
 
-`leafMask` is therefore the accessor's **partial-hit signal** — a concept that has
-no scalar analog. Without it, the accessor would evict prematurely on every straddle
-batch, losing the cross-batch amortization that makes `mProbedMask` valuable.
+The scalar defaults allow instantiation without a SIMD library, giving a clean
+scalar path for debugging and cross-validation.
 
-The straddle lane problem is solved at the call site by masking: straddle lanes receive
-a sentinel voxelOffset value (`kSentinelExpanded = expandVoxelOffset(292)`, local
-coordinate (4,4,4)) that produces no false direction bits in either the plus-OR or
-minus-AND reduction. This is already implemented and verified in the Phase 1
-prototype (`ex_stencil_gather_cpu`).
+Per-lane access is provided by `nanovdb::util::simd_traits` (defined in `Simd.h`),
+which works for both scalar and vector types via specialisation.
 
 ---
 
-## 4. The Prefetch Insight — Extremal Taps as a Neighborhood Census
+## 3. Persistent State
 
-The naive "vanilla accessor" approach would issue a `probeLeaf` call on first access
-for each stencil tap, lazily. The `BatchAccessor` exploits **domain-specific
-knowledge of the stencil geometry** to warm the cache with a minimal set of
-strategically chosen taps — the *extremal* taps — that together constitute a complete
-census of the neighborhood.
+Four members persist across batches within one center leaf:
 
-### 4a. WENO5 (Axis-Aligned, Reach R=3) — 6 Extremal Taps
+```cpp
+const GridT&  mGrid;                     // for probeLeaf calls via mGrid.tree()
+uint32_t      mCenterLeafID;             // index of current center leaf
+Coord         mCenterOrigin;             // index-space origin of current center leaf
+uint32_t      mProbedMask = (1u << 13);  // bit 13 (center) pre-set at construction
+const LeafT*  mLeafNeighbors[27];        // [13] = center (eager); others: lazily probed
+```
 
-For an axis-aligned stencil, only one axis can cross a leaf boundary per tap. The
-condition for needing the x+ neighbor leaf is:
+**Direction encoding** (`dir` is a `static constexpr` member):
 
 ```
-∃ delta ∈ {1..R} s.t. lx + delta ≥ 8 ↔ lx ≥ 8 − R
+dir(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)        dx,dy,dz ∈ {-1,0,+1}
 ```
 
-The extremal tap at `+R` detects exactly `lx + R ≥ 8 ↔ lx ≥ 8 − R` — which is the
-**necessary and sufficient condition** for needing x+ at all. Any smaller delta for
-the same voxel would probe the same x+ leaf if it crosses, or not cross at all.
+`mLeafNeighbors[27]` is a flat array indexed by `dir(dx,dy,dz)`.
+`mLeafNeighbors[13]` (= `dir(0,0,0)`) is the center leaf pointer. +`mLeafNeighbors[d]` is `nullptr` when the neighbor leaf lies outside the narrow band. -Therefore, prefetching the 6 extremal taps covers all directions needed by any -intermediate tap: +**Why pointers, not leaf IDs:** `cachedGetValue` accesses the leaf data array for +every active lane in every batch. Storing `const LeafT*` avoids a `base + id * +sizeof(LeafT)` multiply on every call; `nullptr` is a natural "outside narrow band" +sentinel. `NanoVDB::ReadAccessor` uses the same approach for its cached node pointers. -``` -prefetch<+R, 0, 0>, prefetch<-R, 0, 0> → x+ / x- face leaves -prefetch< 0,+R, 0>, prefetch< 0,-R, 0> → y+ / y- face leaves -prefetch< 0, 0,+R>, prefetch< 0, 0,-R> → z+ / z- face leaves +**Cache advance:** when `none_of(leafMask)` fires in the outer loop: + +```cpp +void advance(uint32_t newLeafID) { + mCenterLeafID = newLeafID; + mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); + mLeafNeighbors[dir(0,0,0)] = &mGrid.tree().getFirstLeaf()[newLeafID]; + mProbedMask = (1u << dir(0,0,0)); // center pre-set; neighbors stale +} ``` -For WENO5 with R=3: **6 probeLeaf calls maximum** per center leaf, covering all -19 stencil taps. This is identical to what `computeNeededDirs` computes (the carry -trick encodes all 6 thresholds simultaneously). +Stale neighbor entries in `mLeafNeighbors[]` are harmless: `mProbedMask` has only +bit 13 set, so `toProbe = neededMask & ~mProbedMask` will never return a stale index. -### 4b. 3×3×3 Box Stencil (R=1) — 8 Corner Taps +--- -For the box stencil, a stencil tap at `(lx+dx, ly+dy, lz+dz)` where `dx,dy,dz ∈ -{-1,0,+1}` can cross one, two, or three axes simultaneously (face, edge, or corner -neighbor leaf respectively). +## 4. Eviction and the Straddle Problem -**Claim**: the 8 corner taps `(±1, ±1, ±1)` collectively cover all 26 non-center -neighbor directions for any voxel position in the batch. +In a SIMD batch, "straddle lanes" are active voxels that belong to a *later* leaf +(`leafIndex[i] != mCenterLeafID`, `leafMask[i] = false`). They do NOT trigger an +eviction — the cache is still valid for the remaining current-leaf lanes. -**Coverage argument**: For any voxel `(lx, ly, lz)` and any direction -`(dx, dy, dz)` that the stencil actually needs (i.e., some coordinate crosses a leaf -boundary), there exists a corner tap `(sx, sy, sz)` with `sx, sy, sz ∈ {-1, +1}` -such that when applied to this voxel it probes the **same neighbor leaf**. +Eviction fires only when `none_of(leafMask)` — no lane in the batch belongs to the +current leaf. -Concretely, the corner tap `(-1,-1,+1)` applied to voxel `(0, 0, 4)` accesses -`(-1, -1, 5)`, which falls in the `(x−, y−)` edge leaf — the same leaf needed by -the edge tap `(-1, -1, 0)` for this voxel. The corner tap `(-1,+1,-1)` for the -same voxel accesses `(-1, 1, 3)`, falling in the `x−` face leaf — the same leaf -needed by `(-1, 0, 0)`. +`leafMask` is the accessor's **partial-hit signal** — a concept with no scalar analog. -Each corner tap, applied to varying voxel positions in the batch, will probe face, -edge, or corner leaves depending on how many axes actually cross — collectively -exhausting all 26 directions across the batch. +Straddle lanes are given the inactive sentinel voxel offset `kInactiveVoxelOffset` +(= local coordinate (4,4,4)), which is strictly interior to the leaf and generates +no false crossing detections. 
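
Inside `prefetch` this masking is a single blend, quoted from `BatchAccessor.h`
(`kSentinel15` is the pre-expanded 15-bit form of this sentinel):

```cpp
// Straddle/inactive lanes are forced to the interior sentinel before the SWAR add.
const auto packed_lc =
    util::where(leafMask, expanded, VoxelOffsetT(kSentinel15));
```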
The outer `while (any_of(activeMask))` loop processes +one leaf ID per iteration, re-using the same SIMD batch: -**At most 8 probeLeaf calls** per center leaf for the full 27-point box stencil -(in practice fewer, since many corner taps land in the center leaf for interior -voxels, and `mProbedMask` prevents re-probing the same direction twice). +``` +while any_of(activeMask): + leafMask = activeMask & (leafIndex_vec == mCenterLeafID) + if none_of(leafMask): + acc.advance(++currentLeafID) + continue + # prefetch + cachedGetValue for leafMask lanes only + acc.prefetch<...>(vo, leafMask) + acc.cachedGetValue<...>(result, vo, leafMask) # fills leafMask lanes of result + activeMask &= ~leafMask +# all lanes now filled; call kernel once with complete result +``` --- -## 5. API — Three Tiers +## 5. Center Leaf Initialisation — Eager (Constructor and advance) -### 5a. Core Functions +`mLeafNeighbors[dir(0,0,0)]` (center) is populated **eagerly** by both the +constructor and `advance()`: ```cpp -// ── Tier 1a: warm the cache for a specific stencil offset ────────────────── -// For each active (leafMask) lane: compute which neighbor leaf the tap -// (di,dj,dk) falls in, probe it into mPtrs[] if not already in mProbedMask. -// Takes treeAcc — may call probeLeaf. -template -void prefetch(Simd vo, LaneMask leafMask, AccT& treeAcc); +mLeafNeighbors[dir(0,0,0)] = &mGrid.tree().getFirstLeaf()[mCenterLeafID]; +mProbedMask = (1u << dir(0,0,0)); // bit 13 pre-set +``` -// ── Tier 1b: read from cache (cache assumed warm) ────────────────────────── -// For each active lane: compute local offset within the cached neighbor leaf, -// fetch and return the value (or index for ValueOnIndex grids). -// Does NOT take treeAcc — guaranteed not to touch the tree. -// Debug builds assert mProbedMask covers the needed direction. -template -Simd cachedGetValue(Simd vo, LaneMask leafMask) const; +The center pointer is O(1) to compute — no `probeLeaf` traversal needed — so there +is no reason to defer it. -// ── Tier 2: lazy combined operation (vanilla accessor style) ─────────────── -// Equivalent to prefetch + cachedGetValue. -// Correct without explicit prefetch management; slightly suboptimal for -// repeated calls in the same center leaf (redundant bitmask checks). -template -Simd getValue(Simd vo, LaneMask leafMask, AccT& treeAcc); -``` +**Consequences:** -The presence or absence of `treeAcc` in the signature is self-documenting: -`cachedGetValue` is the only function that can be called in a "no tree access" -context, and the compiler enforces that it doesn't get one. +- `cachedGetValue<0,0,0>` (center tap) is valid immediately after construction or + `advance()`, without any `prefetch` call. +- The SWAR `neededMask` computed inside `prefetch` never needs to include bit 13: + crossings are detected per-axis, and a lane whose tap stays in the center leaf + contributes `dir(0,0,0)` which is already in `mProbedMask` and filtered by + `toProbe = neededMask & ~mProbedMask`. +- The `if (d == dir(0,0,0))` special case is removed from the probe loop: every + direction in `toProbe` is a genuine neighbor requiring `probeLeaf`. -### 5b. Usage Patterns +--- + +## 6. API -**Tier 1 — production path** (explicit prefetch, recommended for performance-critical -stencil kernels): +### 6a. 
Direction Helper ```cpp -// Warm the cache with the 6 WENO5 extremal taps -batchAcc.prefetch<-3, 0, 0>(vo, leafMask, treeAcc); -batchAcc.prefetch<+3, 0, 0>(vo, leafMask, treeAcc); -batchAcc.prefetch< 0,-3, 0>(vo, leafMask, treeAcc); -batchAcc.prefetch< 0,+3, 0>(vo, leafMask, treeAcc); -batchAcc.prefetch< 0, 0,-3>(vo, leafMask, treeAcc); -batchAcc.prefetch< 0, 0,+3>(vo, leafMask, treeAcc); - -// All cachedGetValue calls are pure arithmetic + gather — no tree access -auto u_m3 = batchAcc.cachedGetValue<-3, 0, 0>(vo, leafMask); -auto u_m2 = batchAcc.cachedGetValue<-2, 0, 0>(vo, leafMask); -auto u_m1 = batchAcc.cachedGetValue<-1, 0, 0>(vo, leafMask); -auto u_0 = batchAcc.cachedGetValue< 0, 0, 0>(vo, leafMask); -auto u_p1 = batchAcc.cachedGetValue<+1, 0, 0>(vo, leafMask); -auto u_p2 = batchAcc.cachedGetValue<+2, 0, 0>(vo, leafMask); -auto u_p3 = batchAcc.cachedGetValue<+3, 0, 0>(vo, leafMask); -// ... y and z axes similarly - -Simd flux_x = wenoKernel(u_m3, u_m2, u_m1, u_0, u_p1, u_p2, u_p3); +static constexpr int dir(int dx, int dy, int dz); ``` -**Tier 2 — prototyping path** (lazy, correct, no explicit prefetch management): +### 6b. Lifecycle ```cpp -// Identical stencil formula; each getValue probes lazily on first need -auto u_m3 = batchAcc.getValue<-3, 0, 0>(vo, leafMask, treeAcc); -auto u_m2 = batchAcc.getValue<-2, 0, 0>(vo, leafMask, treeAcc); -// ... +BatchAccessor(const GridT& grid, uint32_t firstLeafID); +void advance(uint32_t newLeafID); ``` -The redundant `prefetch` calls inside non-extremal `getValue` invocations reduce to -a single `mProbedMask` bitmask check and immediate return — the direction was already -probed by an earlier extremal call. - -### 5c. Invariant Ordering - -In Tier 1, all `prefetch` calls must precede all `cachedGetValue` calls for the same -batch. A debug-mode RAII scope guard (`batchAcc.beginGather()` / `endGather()`) could -enforce this, but is probably overkill for a first implementation. - ---- - -## 6. Template vs Runtime Interface - -### 6a. Arguments for `` Template Parameters - -- **Compile-time direction resolution**: for `cachedGetValue<-3,0,0>`, the compiler - proves only lx can cross, and only leftward. The direction bit reduces to a - compile-time choice between two constants (`mPtrs[4]` or `mPtrs[13]`); y/z - boundary checks are eliminated entirely. -- **Dead axis elimination**: for axis-aligned taps, two of the three axis checks - vanish at compile time. -- **VDB convention alignment**: `WenoPt::idx`, `NineteenPt::idx` — - the ecosystem already addresses stencil points as compile-time named entities. -- **Structural contract**: the `prefetch`/`cachedGetValue` pairing is expressible as - a static invariant when offsets are compile-time constants. - -### 6b. When Runtime `nanovdb::Coord` Is Needed - -A generic `computeStencil` that iterates over `StencilT::offsets` at -runtime cannot use template parameters. A runtime overload: +### 6c. Tier 1a — `prefetch` ```cpp -Simd getValue(nanovdb::Coord offset, - Simd vo, - LaneMask leafMask, AccT& treeAcc); +template +void prefetch(VoxelOffsetT vo, PredicateT leafMask); ``` -dispatches through a small switch on the runtime direction bit (26 cases, easily -predicted). The gather still dominates; the dispatch overhead is negligible. +- Computes the neighbor direction for each active lane. +- Probes at most one new leaf per unique direction per call (skips directions + already in `mProbedMask`). +- Calls `mGrid.tree().probeLeaf(coord)` directly — no `AccT` parameter. 
+ `ReadAccessor` is not used because `probeLeaf` only hits the LEVEL=0 leaf cache, + which is never warm for neighbor leaves; the internal-node caches are bypassed + entirely for `GetLeaf` operations. +- The center direction is set from `mCenterLeafID` without `probeLeaf`. -**C++20 note**: if `nanovdb::Coord` is made a structural type, the template and -runtime interfaces unify naturally: +### 6d. Tier 1b — `cachedGetValue` ```cpp -template -Simd cachedGetValue(Simd vo, LaneMask leafMask) const; - -// Called as: -batchAcc.cachedGetValue(vo, leafMask); +template +void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const; ``` -### 6c. Recommendation +- Fills **only the `leafMask` lanes** of `result` (by reference). +- Inactive lanes are not touched — values from a previous iteration are preserved. +- This is the correct API for the straddle-aware outer loop: the caller declares + all stencil result variables before the `while` loop, fills them progressively + across iterations, and calls the kernel once after `activeMask` is empty. +- Requires the corresponding direction to be in `mProbedMask` (asserted in debug). +- `nullptr` leaf (outside narrow band) writes `ScalarValueT(0)`. -- **Template ``** as the primary, idiomatic interface for all hand-written - stencil kernels — cleaner codegen, natural fit with VDB conventions. -- **Runtime `Coord` overload** for generic stencil adapters and prototyping loops. -- Both interfaces backed by the same `mPtrs[]` / `mProbedMask` state machine. - ---- +### 6e. Deferred -## 7. AVX2 Vectorization Profile +`getValue` (lazy combined) and the runtime `nanovdb::Coord` overload +are not yet implemented. Both are additive and straightforward once the two +primitives above are validated. -### 7a. `prefetch` — Crossing Detection - -``` -Extract lx/ly/lz from all 16 vo lanes vpsrl / vpand ymm (SIMD) -Compare lx+di against [0,7] vpcmpgtd ymm (SIMD) -Fold crossing mask to scalar bitmask vmovmskps ymm (SIMD) -AND with ~mProbedMask scalar bitmask check -If new direction needed: probeLeaf scalar (≤1 call per prefetch for WENO5) -``` +--- -Structurally identical to the `computeNeededDirs` carry trick in the prototype -(indeed, `prefetch` is `computeNeededDirs` specialized to a single tap). +## 7. Prefetch Patterns -### 7b. `cachedGetValue` — Offset Arithmetic and Gather +### WENO5 (R=3, axis-aligned) — 6 extremal taps +```cpp +acc.prefetch<-3, 0, 0>(vo, leafMask); +acc.prefetch<+3, 0, 0>(vo, leafMask); +acc.prefetch< 0,-3, 0>(vo, leafMask); +acc.prefetch< 0,+3, 0>(vo, leafMask); +acc.prefetch< 0, 0,-3>(vo, leafMask); +acc.prefetch< 0, 0,+3>(vo, leafMask); +// All subsequent cachedGetValue calls are pure arithmetic — no tree access. +auto u_m3 = /* ... */; acc.cachedGetValue<-3,0,0>(u_m3, vo, leafMask); +auto u_m2 = /* ... */; acc.cachedGetValue<-2,0,0>(u_m2, vo, leafMask); +// ... 
19 taps total +Simd flux_x = wenoKernel(u_m3, u_m2, u_m1, u_0, u_p1, u_p2, u_p3); ``` -Compute neighbor offsets for all 16 lanes: - nx[i] = lx[i] + di, wrapped to [0,7] vpaddd / vpand ymm (SIMD, constant di) - local offset[i] = nx[i]*64 + ny[i]*8 + nz[i] vpmadd / vpaddd ymm -Determine which lanes cross to neighbor leaf: - crossMask = (lx < threshold) vpcmpgtd ymm (SIMD) +### Box stencil (R=1) — 8 corner taps -Gather values from (at most) two leaf arrays: - centerVals = gather(mPtrs[13]->array, offset) vgatherdps ymm (SIMD) - neighborVals = gather(mPtrs[dir]->array, offset_wrapped) vgatherdps ymm (SIMD) - result = blend(crossMask, neighborVals, centerVals) vpblendvb ymm (SIMD) +```cpp +for each (sx,sy,sz) in {±1}³: + acc.prefetch(vo, leafMask); +// then cachedGetValue for all 27 taps ``` -The key insight: for axis-aligned WENO5 taps, there are **at most two distinct leaf -pointers** across all 16 lanes. This reduces the gather to two base-pointer loads -plus a predicated blend — a clean AVX2 pattern. - -### 7c. Comparison to Phase 1 Prototype - -The two scalar bottlenecks in the prototype are eliminated by `BatchAccessor`: +--- -| Phase 1 bottleneck | `BatchAccessor` replacement | AVX2? | -|--------------------|----------------------------|-------| -| `expandVoxelOffset` scatter (conditional per-lane) | `cachedGetValue` offset arithmetic (uniform SIMD add) | ✓ | -| `batchPtrs` fill (pointer scatter, data-dependent) | Crossing mask + gather + blend | ✓ | -| `probeLeaf` loop | `prefetch` (≤1 probeLeaf per call) | inherently scalar | +## 8. Implementation Notes -The WENO kernel itself (`wenoKernel(u_m3, ..., u_p3)`) operates entirely on -`Simd` with no tree access in sight. +### 8a. Lane loop in prefetch / cachedGetValue -### 7d. Complete Per-Batch AVX2 Profile +The current implementation uses a scalar `for (int i = 0; i < LaneWidth; ++i)` loop +over lanes, using `simd_traits::get` / `set` for per-lane access. This is correct +for both scalar (LaneWidth=1) and SIMD (LaneWidth=W) instantiations. -| Operation | Instructions | Vectorized? | -|-----------|-------------|-------------| -| `activeMask` computation | `vpcmpeqd ymm ×4` + `vmovmskps ×2` | ✓ Full | -| `leafMask` computation | `vpbroadcastd` + `vpcmpeqd ymm ×2` + `vmovmskps ×2` | ✓ Full | -| `prefetch` crossing detection | `vpcmpgtd ymm` + `vmovmskps` | ✓ Full | -| `probeLeaf` (per prefetch) | scalar tree traversal | inherently scalar | -| `cachedGetValue` offset arithmetic | `vpaddd ymm` / `vpand ymm` | ✓ Full | -| `cachedGetValue` lane split | `vpcmpgtd ymm` | ✓ Full | -| `cachedGetValue` value gather | `vgatherdps ymm ×2` + `vpblendvb ymm` | ✓ Full | -| WENO kernel | `Simd` arithmetic | ✓ Full | +`prefetch` is called at most once per direction per center leaf, so the loop is not +performance-critical. `cachedGetValue` is in the hot path; the loop over W=16 lanes +with scalar per-lane `leaf->getValue(offset)` is a first correct implementation. +Vectorising this loop (SIMD offset arithmetic + `vgatherdps`) is the Phase 2 +optimisation task described in `StencilGather.md §7b`. ---- +### 8b. No tree accessor in prefetch -## 8. Scoping and Lifetime +NanoVDB's `ReadAccessor` is not passed to `prefetch`. Its LEVEL=0 leaf cache is never +warm for neighbor leaves (by definition distinct from the center leaf), and its +internal-node caches are bypassed entirely when `get` misses at LEVEL=0. +`probeLeaf` is equivalent to a direct root traversal in all non-trivial cases. 
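
A sketch of the §8a Phase 2 path (an assumption, not shipped code): for an
axis-aligned `+x` tap, at most two leaf base pointers serve the whole batch, so
the tap reduces to two merge-masked gathers. The function name, the raw `float*`
leaf arrays, and the mask retyping below are illustrative; `SimdMask` mask
algebra (`&`, `!`) and the element-type converting constructor are assumed
available as in `Simd.h`'s array backend.

```cpp
template<int di, int W>   // di in {1,2,3}: a +x tap of reach <= 3
void gatherTapXPlus(Simd<float, W>& result,
                    Simd<uint16_t, W> vo,           // 9-bit offsets, x*64 + y*8 + z
                    SimdMask<uint16_t, W> leafMask, // lanes owned by the center leaf
                    const float* centerVals,        // center leaf values (512 entries)
                    const float* xPlusVals)         // x+ neighbor leaf values
{
    using V = Simd<uint16_t, W>;
    const V lx = vo >> uint16_t(6);                                  // local x in [0,7]
    const auto cross = ((lx + V(uint16_t(di))) >> uint16_t(3)) != V(uint16_t(0));
    const V tap = (vo + V(uint16_t(di * 64))) & V(uint16_t(0x1FF));  // mod-512 wrap into x+
    // Two merge-masked gathers: non-crossing lanes read the center leaf, crossing
    // lanes read the x+ neighbor; lanes outside leafMask are left untouched.
    gather_if(result, SimdMask<float, W>(leafMask & !cross), centerVals, tap);
    gather_if(result, SimdMask<float, W>(leafMask & cross),  xPlusVals,  tap);
}
```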
-A `BatchAccessor` is scoped to **one CPU thread**, constructed once before the block -loop and reused across all batches and all blocks: +### 8c. probeLeaf returns nullptr for missing neighbors -```cpp -BatchAccessor batchAcc(grid, firstLeafID[0]); - -for (uint32_t bID = 0; bID < nBlocks; bID++) { - decodeInverseMaps(..., leafIndex, voxelOffset); - - for (int b = 0; b < BlockWidth; b += SIMDw) { - // compute activeMask, leafMask ... - while (any_of(activeMask)) { - if (none_of(leafMask)) { - batchAcc.advance(++currentLeafID); - continue; - } - // prefetch / cachedGetValue / kernel ... - } - } -} -``` - -**Cross-block carryover**: resetting `mProbedMask` between blocks is safe and simple. -Carrying over is also valid — consecutive blocks process spatially adjacent leaves, -so some `mPtrs[]` entries may still be correct. In practice, resetting is recommended -(one `mProbedMask = 0` per block, negligible cost) to avoid subtle stale-pointer bugs. +`mGrid.tree().probeLeaf(coord)` returns `nullptr` when the requested coordinate lies +outside the active narrow band. `cachedGetValue` checks for `nullptr` and returns +`ScalarValueT(0)`, which is correct for level-set grids (background value = 0). --- -## 9. Relationship to the Phase 1 Prototype +## 9. Relationship to Phase 1 Prototype -`ex_stencil_gather_cpu` (`stencil_gather_cpu.cpp`) implements the core cache -machinery as free functions: +`ex_stencil_gather_cpu` implements the core cache machinery as free functions. | Prototype component | `BatchAccessor` equivalent | |--------------------|-----------------------------| -| `probedMask` + `ptrs[27]` locals | `mProbedMask` + `mPtrs[27]` members | -| `computeNeededDirs(expandedVec)` | inner logic of `prefetch` (one tap) | -| `kSentinelExpanded` broadcast | same sentinel in `prefetch` for straddle lanes | -| `probeLeaf` loop (`toProbe` bits) | `prefetch` body | -| `batchPtrs[4][SIMDw]` population | replaced by `cachedGetValue` gather + blend | +| `probedMask` + `ptrs[27]` locals | `mProbedMask` + `mLeafNeighbors[27]` members | +| `computeNeededDirs(expandedVec)` | per-lane loop inside `prefetch` | +| `kSentinelExpanded` broadcast | sentinel applied by caller before `prefetch` | +| `probeLeaf` loop (`toProbe` bits) | `while (toProbe)` inside `prefetch` | +| `batchPtrs[4][SIMDw]` population | replaced by `cachedGetValue` | | `verifyBatchPtrs` | future: `cachedGetValue` unit test | -Phase 2 (not yet implemented): `cachedGetValue` — the actual index/value gather from -the cached leaf pointers. The AVX2 machinery for crossing detection and offset -arithmetic is a direct extension of what is already working and verified in Phase 1. - --- -## 10. Open Questions / Future Work +## 10. Future Work -- **`ValueOnIndex` two-level fetch**: `cachedGetValue` returns `Simd` - indices for index grids; a `cachedGetValue(channel, vo, leafMask)` overload - dereferences through a channel pointer in one step. Channel data layout (AoS vs SoA) - affects gather efficiency. +- **`cachedGetValue` vectorisation (Phase 2):** replace per-lane scalar loop with SIMD + offset arithmetic + `vgatherdps` × 2 + `vpblendvb` for the two-pointer case. + See `StencilGather.md §7b` for the AVX2 profile. -- **Multi-leaf stencils (R > 4)**: the single-neighbor-per-axis assumption breaks for - stencils with reach R > 4 (a center voxel can simultaneously need both the lo and hi - neighbor along the same axis). 
`mPtrs[27]` remains correct; only the `cachedGetValue` - lane-split logic (currently "at most 2 leaf pointers per axis tap") needs generalization. +- **`getValue`:** lazy combined `prefetch` + `cachedGetValue`. -- **Generic stencil adapter**: a `computeStencil` wrapper that calls - `getValue(StencilT::offset(n), ...)` for `n = 0..N-1` via the runtime `Coord` - overload — correctness-first entry point for new stencil types. +- **Runtime `Coord` overload:** for generic stencil adapters iterating over an offset + list at runtime. -- **C++20 structural `Coord`**: unify template and runtime interfaces with - `cachedGetValue(vo, leafMask)` non-type template parameter. +- **`StencilAccessor`:** higher-level wrapper that owns the `while (any_of)` loop, + hides straddling from the caller, and fills complete stencil result arrays. -- **Debug-mode RAII scope guard**: enforce the prefetch-before-cachedGetValue ordering - in debug builds without any runtime cost in release. +- **Multi-leaf stencils (R > 4):** the single-neighbor-per-axis assumption in + `cachedGetValue` holds for R ≤ 4. Generalisation requires checking both lo and hi + neighbors per axis. -- **Launcher integration**: the `BatchAccessor` is a per-block, per-thread object. - The system-level launcher (the `buildVoxelBlockManager` analogue for stencil - computation) constructs one per worker thread and passes it into the per-block kernel. - Design of the launcher is deferred until the per-block kernel is fully validated. +- **C++20 structural `Coord`:** unify template and runtime interfaces via + `cachedGetValue(result, vo, leafMask)`. diff --git a/simd_test/Simd.h b/simd_test/Simd.h index 407ce2a562..a757efe003 100644 --- a/simd_test/Simd.h +++ b/simd_test/Simd.h @@ -1,5 +1,6 @@ #pragma once #include +#include // Minimal SIMD abstraction for NanoVDB stencil kernels. // @@ -81,6 +82,17 @@ inline Simd where(SimdMask mask, Simd a, Simd b) { stdx::where(mask, result) = a; return result; } +// Heterogeneous where: mask element type U ≠ value element type T. +// Converts the U-mask to a T-mask via a boolean round-trip. +template +inline Simd where(SimdMask mask, Simd a, Simd b) { + bool arr[W]; + for (int i = 0; i < W; i++) arr[i] = static_cast(mask[i]); + SimdMask tmask(arr, element_aligned); + auto result = b; + stdx::where(tmask, result) = a; + return result; +} template inline bool any_of(SimdMask m) { return stdx::any_of(m); } @@ -89,6 +101,33 @@ inline bool none_of(SimdMask m) { return stdx::none_of(m); } template inline bool all_of(SimdMask m) { return stdx::all_of(m); } +// Unmasked gather: result[i] = ptr[idx[i]] for all lanes. +// Expressed as a generator constructor — Clang lowers to vgatherdps (all-ones mask). +template +inline Simd gather(const T* __restrict__ ptr, Simd idx) { + return Simd([&](int i) { return ptr[idx[i]]; }); +} + +// Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback. +// Implemented as a full gather + where-blend; ptr is accessed for ALL lanes, +// so every idx[i] must be a valid offset regardless of mask[i]. +template +inline Simd gather(SimdMask mask, const T* __restrict__ ptr, + Simd idx, T fallback = T(0)) { + auto result = Simd(fallback); + stdx::where(mask, result) = Simd([&](int i) { return ptr[idx[i]]; }); + return result; +} + +// Merge-masked gather: dst[i] = mask[i] ? ptr[idx[i]] : dst[i] (unchanged). +// Mirrors vgatherdps merge-masking semantics: dst is both input and output. +// Hope: compiler emits a single vgatherdps with dst as the destination register. 
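+// Usage sketch (illustrative names, not from the prototype):
+//     gather_if(vals, crossMask, neighborLeafData, wrappedOffsets);
+// fills only the crossing lanes of vals, leaving lanes that a previous
+// gather_if (e.g. from the center leaf) already wrote intact.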
+template +inline void gather_if(Simd& dst, SimdMask mask, + const T* __restrict__ ptr, Simd idx) { + stdx::where(mask, dst) = Simd([&](int i) { return ptr[idx[i]]; }); +} + // =========================================================================== // Implementation B: std::array backend (default) // =========================================================================== @@ -97,6 +136,17 @@ inline bool all_of(SimdMask m) { return stdx::all_of(m); } template struct SimdMask { std::array data{}; + SimdMask() = default; + NANOVDB_SIMD_HOSTDEV explicit SimdMask(const bool* p, element_aligned_tag = {}) { + for (int i = 0; i < W; i++) data[i] = p[i]; + } + // Converting constructor: copy bool values from a mask over a different element type. + // All SimdMask are boolean arrays of the same width; this allows + // where(SimdMask, Simd, Simd) without explicit casting. + template + NANOVDB_SIMD_HOSTDEV explicit SimdMask(SimdMask const& o) { + for (int i = 0; i < W; i++) data[i] = o[i]; + } NANOVDB_SIMD_HOSTDEV bool operator[](int i) const { return data[i]; } NANOVDB_SIMD_HOSTDEV bool& operator[](int i) { return data[i]; } NANOVDB_SIMD_HOSTDEV SimdMask operator!() const { @@ -150,6 +200,22 @@ struct Simd { NANOVDB_SIMD_HOSTDEV SimdMask operator!=(Simd o) const { SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] != o.data[i]; return m; } + // Bitwise and shift operators — valid for integer element types. + NANOVDB_SIMD_HOSTDEV Simd operator|(Simd o) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] | o.data[i]; return r; + } + NANOVDB_SIMD_HOSTDEV Simd operator&(Simd o) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] & o.data[i]; return r; + } + NANOVDB_SIMD_HOSTDEV Simd operator^(Simd o) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] ^ o.data[i]; return r; + } + NANOVDB_SIMD_HOSTDEV Simd operator<<(Simd o) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] << o.data[i]; return r; + } + NANOVDB_SIMD_HOSTDEV Simd operator>>(Simd o) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] >> o.data[i]; return r; + } }; template NANOVDB_SIMD_HOSTDEV @@ -181,6 +247,12 @@ template NANOVDB_SIMD_HOSTDEV Simd where(SimdMask mask, Simd a, Simd b) { Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r; } +// Heterogeneous where: mask element type U need not match value element type T. +// Useful for applying PredicateT=SimdMask to VoxelOffsetT=Simd. +template +NANOVDB_SIMD_HOSTDEV Simd where(SimdMask mask, Simd a, Simd b) { + Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r; +} template NANOVDB_SIMD_HOSTDEV bool any_of(SimdMask m) { @@ -193,8 +265,79 @@ NANOVDB_SIMD_HOSTDEV bool all_of(SimdMask m) { bool r = true; for (int i = 0; i < W; i++) r &= m[i]; return r; } +// Unmasked gather: result[i] = ptr[idx[i]] for all lanes. +template +NANOVDB_SIMD_HOSTDEV Simd gather(const T* __restrict__ ptr, Simd idx) { + Simd r; + for (int i = 0; i < W; i++) r[i] = ptr[idx[i]]; + return r; +} + +// Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback. +// Scalar path: accesses ptr only for true lanes (ternary short-circuits). +template +NANOVDB_SIMD_HOSTDEV Simd gather(SimdMask mask, const T* __restrict__ ptr, + Simd idx, T fallback = T(0)) { + Simd r; + for (int i = 0; i < W; i++) r[i] = mask[i] ? ptr[idx[i]] : fallback; + return r; +} + +// Merge-masked gather: dst[i] = mask[i] ? ptr[idx[i]] : dst[i] (unchanged). +// Scalar path: only accesses ptr for true lanes. 
+template<typename T, typename I, int W>
+NANOVDB_SIMD_HOSTDEV void gather_if(Simd<T, W>& dst, SimdMask<T, W> mask,
+                                    const T* __restrict__ ptr, Simd<I, W> idx) {
+    for (int i = 0; i < W; i++)
+        if (mask[i]) dst[i] = ptr[idx[i]];
+}
+
 #endif // NANOVDB_USE_STD_SIMD
 
+// ---------------------------------------------------------------------------
+// simd_traits — generic per-lane access for scalar and Simd types.
+//
+// Lets algorithms be written once and work for both scalar (width=1) and
+// vector (width=W) instantiations. The class does not need to know whether
+// it is working with scalars or SIMD vectors.
+//
+// Primary template: scalar types.
+// Specializations below: Simd and SimdMask (both backends).
+// ---------------------------------------------------------------------------
+template<typename T>
+struct simd_traits {
+    static constexpr int width = 1;
+    using scalar_type = T;
+    NANOVDB_SIMD_HOSTDEV static T get(T v, int) { return v; }
+    NANOVDB_SIMD_HOSTDEV static void set(T& v, int, T val) { v = val; }
+};
+
+template<>
+struct simd_traits<bool> {
+    static constexpr int width = 1;
+    using scalar_type = bool;
+    NANOVDB_SIMD_HOSTDEV static bool get(bool m, int) { return m; }
+    NANOVDB_SIMD_HOSTDEV static void set(bool& m, int, bool v) { m = v; }
+};
+
+// Simd and SimdMask: valid for both backends because the aliases
+// are already resolved by the time these specializations are instantiated.
+template<typename T, int W>
+struct simd_traits<Simd<T, W>> {
+    static constexpr int width = W;
+    using scalar_type = T;
+    NANOVDB_SIMD_HOSTDEV static T get(Simd<T, W> v, int i) { return v[i]; }
+    NANOVDB_SIMD_HOSTDEV static void set(Simd<T, W>& v, int i, T val) { v[i] = val; }
+};
+
+template<typename T, int W>
+struct simd_traits<SimdMask<T, W>> {
+    static constexpr int width = W;
+    using scalar_type = bool;
+    NANOVDB_SIMD_HOSTDEV static bool get(SimdMask<T, W> m, int i) { return m[i]; }
+    NANOVDB_SIMD_HOSTDEV static void set(SimdMask<T, W>& m, int i, bool v) { m[i] = v; }
+};
+
 // ---------------------------------------------------------------------------
 // to_bitmask — fold SimdMask<T, W> into a uint32_t (one bit per lane).
 // T is the associated element type; only W matters. Requires W <= 32.

From dce5240310d013219ada6ed608b275a7e9724529 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Wed, 15 Apr 2026 16:00:08 -0500
Subject: [PATCH 17/60] Simd.h + BatchAccessor: scalar_traits, uniform shifts,
 SWAR cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simd.h:
- Add scalar_traits / scalar_traits_t: extracts element type from plain
  scalars (identity) and Simd<T, W> (T); used by BatchAccessor static_asserts
  and as the shift-count type for the new uniform-shift ops.
- Add Simd::operator<<(T) / operator>>(T): uniform scalar shift (all lanes by
  the same immediate). Maps to vpsllw imm8 / vpsrlw imm8 on x86 —
  distinguished from the existing per-lane Simd::operator<<(Simd) overload.

BatchAccessor.h:
- Use VoxelOffsetScalarT (= scalar_traits_t<VoxelOffsetT>) so the code is
  correct for any unsigned 16+-bit instantiation, not just Simd<uint16_t, 16>.
- Add class-level static_asserts (unsigned + sizeof >= 2) with explanatory
  messages referencing the SWAR carry-detection contract.
- Remove static_assert(LaneWidth >= 16): was a performance aspiration, not a
  correctness requirement; SWAR works for any LaneWidth >= 1.
- Use 'auto' for expanded / packed_lc / packed_sum (type already expressed by
  the initializer); keep explicit uint32_t for hor_or/hor_and/s where the
  width is a deliberate semantic choice.
- Replace kMask15 hex literal 0x71C7u with 0b111'000'111'000'111u (binary
  makes the three 3-bit mask fields and three 3-bit gaps visually explicit).
- Use vo << VoxelOffsetScalarT(9) (uniform shift) instead of
  vo << VoxelOffsetT(VoxelOffsetScalarT(9)) (broadcast-then-per-lane).

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 .../BatchAccessor.h | 69 +++++++++++--------
 simd_test/Simd.h    | 25 +++++++
 2 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
index f2b3f18253..9b9654dfcf 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
@@ -40,9 +40,10 @@
 #pragma once
 
 #include
-#include // simd_traits, Simd, SimdMask
+#include // simd_traits, scalar_traits, Simd, SimdMask
 #include
 #include
+#include
 
 namespace nanovdb {
@@ -75,6 +76,18 @@ class BatchAccessor
     static_assert(Val_traits::width == 1 || Val_traits::width == VO_traits::width,
         "BatchAccessor: ValueT lane width must be 1 (scalar) or match VoxelOffsetT");
 
+    // The SWAR packed layout in prefetch occupies bits 0–14 of each element
+    // (max packed value 0x71C7, max sum 0xE38E). The element type must therefore
+    // be an unsigned integer of at least 16 bits; signed types produce UB on
+    // carry overflow, and 8-bit types cannot hold the packed fields.
+    using VoxelOffsetScalarT = util::scalar_traits_t<VoxelOffsetT>;
+    static_assert(std::is_unsigned_v<VoxelOffsetScalarT>,
+        "BatchAccessor: VoxelOffsetT element type must be unsigned "
+        "(SWAR carry detection requires wrap-around, not signed overflow)");
+    static_assert(sizeof(VoxelOffsetScalarT) >= 2,
+        "BatchAccessor: VoxelOffsetT element type must be at least 16 bits "
+        "(SWAR packed layout occupies bits 0-14, max sum 0xE38E)");
+
 public:
@@ -185,48 +198,46 @@ class BatchAccessor
 
-        // Use VoxelOffsetT (Simd) directly for the packed
-        // arithmetic: 16 × uint16_t = 256 bits = one YMM → one vpaddw, vs two
-        // vpaddd if we widened to uint32_t. All intermediate values fit in uint16_t:
-        // packed_lc ≤ 0x71C7, packed_d ≤ 0x71C7, sum ≤ 0xE38E < 0xFFFF.
-        static_assert(LaneWidth >= 16,
-            "BatchAccessor::prefetch SWAR requires LaneWidth >= 16");
+        // Use VoxelOffsetT directly for the packed arithmetic: LaneWidth elements
+        // of VoxelOffsetScalarT in one register → one vpaddw (16-bit) or vpaddd
+        // (32-bit) depending on the instantiation. All intermediate values fit:
+        // packed_lc ≤ 0x71C7, packed_d ≤ 0x71C7, sum ≤ 0xE38E < 2^16.
 
         // Compile-time packed stencil offset (3-bit two's complement per axis).
-        // Using uint16_t arithmetic: dk & 7u fits in 3 bits, shifted into position.
-        static constexpr uint16_t packed_d =
-            static_cast(
-                (uint32_t(unsigned(dk) & 7u))
-                | (uint32_t(unsigned(di) & 7u) << 6)
-                | (uint32_t(unsigned(dj) & 7u) << 12));
-
-        // Sentinel for inactive lanes: lc = (4,4,4) → packed = 4|(4<<6)|(4<<12) = 0x4104.
-        // Note: expandVoxelOffset(kInactiveVoxelOffset=292) = 0x4104 = kSentinel15.
-        // So even unconditionally expanded inactive-lane values would yield the correct
-        // sentinel. However, straddle lanes carry arbitrary vo values (from the next
-        // leaf), so we must apply leafMask before the add to avoid false crossing signals.
- static constexpr uint16_t kSentinel15 = - static_cast(4u | (4u << 6u) | (4u << 12u)); - static constexpr uint16_t kMask15 = uint16_t(0x71C7u); + // d & 7u gives the 3-bit representation; for negative d, d & 7 = 8+d. + static constexpr auto packed_d = + static_cast( + (unsigned(dk) & 7u) + | ((unsigned(di) & 7u) << 6) + | ((unsigned(dj) & 7u) << 12)); + + // Sentinel for inactive lanes: lc = (4,4,4) → packed = 4|(4<<6)|(4<<12). + // Note: expandVoxelOffset(kInactiveVoxelOffset=292) = kSentinel15, so even + // unconditionally expanded inactive-lane vo values yield the sentinel. + // However, straddle lanes carry arbitrary vo from the next leaf, so we + // must apply leafMask before the add to avoid false crossing signals. + static constexpr auto kSentinel15 = + static_cast(4u | (4u << 6u) | (4u << 12u)); + static constexpr auto kMask15 = + static_cast(0b111'000'111'000'111u); // Expand the 9-bit voxel offset into the 15-bit SWAR packed form — // one vpor + vpsllw + vpand (no scalar loop). // bits [0:2] = lz, bits [6:8] = lx, bits [12:14] = ly // Then blend: active lanes → expanded form, straddle/inactive → sentinel. // util::where accepts SimdMask for any U (heterogeneous overload). - const VoxelOffsetT expanded = - (vo | (vo << VoxelOffsetT(uint16_t(9)))) & VoxelOffsetT(kMask15); - const VoxelOffsetT packed_lc = + const auto expanded = + (vo | (vo << VoxelOffsetScalarT(9))) & VoxelOffsetT(kMask15); + const auto packed_lc = util::where(leafMask, expanded, VoxelOffsetT(kSentinel15)); - // One SIMD add across all LaneWidth lanes (one vpaddw YMM instruction). - const VoxelOffsetT packed_sum = packed_lc + VoxelOffsetT(packed_d); + // One SIMD add across all LaneWidth lanes (one vpaddw/vpaddd instruction). + const auto packed_sum = packed_lc + VoxelOffsetT(packed_d); // Horizontal reductions: widen to uint32_t for the carry-bit checks. uint32_t hor_or = 0u, hor_and = ~0u; for (int i = 0; i < LaneWidth; ++i) { - const uint32_t s = static_cast( - static_cast(VO_traits::get(packed_sum, i))); + const uint32_t s = static_cast(VO_traits::get(packed_sum, i)); hor_or |= s; hor_and &= s; } diff --git a/simd_test/Simd.h b/simd_test/Simd.h index a757efe003..567a3511b1 100644 --- a/simd_test/Simd.h +++ b/simd_test/Simd.h @@ -210,12 +210,20 @@ struct Simd { NANOVDB_SIMD_HOSTDEV Simd operator^(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] ^ o.data[i]; return r; } + // Per-lane variable shift (shift count from corresponding lane of o). NANOVDB_SIMD_HOSTDEV Simd operator<<(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] << o.data[i]; return r; } NANOVDB_SIMD_HOSTDEV Simd operator>>(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] >> o.data[i]; return r; } + // Uniform shift: all lanes shifted by the same scalar count (vpsllw imm8 / vpsrlw imm8). + NANOVDB_SIMD_HOSTDEV Simd operator<<(T shift) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] << shift; return r; + } + NANOVDB_SIMD_HOSTDEV Simd operator>>(T shift) const { + Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] >> shift; return r; + } }; template NANOVDB_SIMD_HOSTDEV @@ -338,6 +346,23 @@ struct simd_traits> { NANOVDB_SIMD_HOSTDEV static void set(SimdMask& m, int i, bool v) { m[i] = v; } }; +// --------------------------------------------------------------------------- +// scalar_traits — extract the scalar element type from T or Simd. +// +// Primary template: a plain scalar type is its own element type. 
+// The = void default parameter reserves a slot for enable_if specialisations. +// Specialisation for Simd: the element type is T. +// scalar_traits_t is a convenience alias for typename scalar_traits::type. +// --------------------------------------------------------------------------- +template +struct scalar_traits { using type = T; }; + +template +struct scalar_traits> { using type = T; }; + +template +using scalar_traits_t = typename scalar_traits::type; + // --------------------------------------------------------------------------- // to_bitmask — fold SimdMask into a uint32_t (one bit per lane). // T is the associated element type; only W matters. Requires W <= 32. From 515be21e6a118e7434ae6b4eb1cb066d0ba2c1a9 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 16 Apr 2026 10:33:39 -0500 Subject: [PATCH 18/60] Simd.h + BatchAccessor + Util.h: 2-arg where, util::reduce, countTrailingZeros - Add 2-argument where(mask, target) = value proxy (stdx and array backends): stdx-style masked assignment; encourages GCC to emit vpblendvb - Add util::reduce(v, op) to both backends: tree-reduces to a scalar with std::bit_or<>{} / std::bit_and<>{} etc.; replaces scalar horizontal loop - Add scalar reduce(T, BinaryOp) identity overload for W=1 path - Add util::countTrailingZeros(uint32_t) to nanovdb/util/Util.h: __hostdev__, CUDA/HIP/__builtin_ctz/MSVC/De Bruijn dispatch; removes ad-hoc ctz from Simd.h - BatchAccessor: use 2-arg where for packed_lc blend, util::reduce for hor_or/and, util::countTrailingZeros in toProbe loop, hoist root ref out of loop body Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../BatchAccessor.h | 32 +++++----- nanovdb/nanovdb/util/Util.h | 35 +++++++++++ simd_test/Simd.h | 59 +++++++++++++++++++ 3 files changed, 111 insertions(+), 15 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h index 9b9654dfcf..813e19f130 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h @@ -43,6 +43,7 @@ #include // simd_traits, scalar_traits, Simd, SimdMask #include #include +#include #include namespace nanovdb { @@ -225,22 +226,20 @@ class BatchAccessor // one vpor + vpsllw + vpand (no scalar loop). // bits [0:2] = lz, bits [6:8] = lx, bits [12:14] = ly // Then blend: active lanes → expanded form, straddle/inactive → sentinel. - // util::where accepts SimdMask for any U (heterogeneous overload). + // util::where(mask, target) = value uses the stdx-style 2-argument proxy: + // packed_lc is pre-initialised to kSentinel15; active lanes are overwritten + // with expanded. This form may emit vpblendvb more reliably under GCC. const auto expanded = (vo | (vo << VoxelOffsetScalarT(9))) & VoxelOffsetT(kMask15); - const auto packed_lc = - util::where(leafMask, expanded, VoxelOffsetT(kSentinel15)); + auto packed_lc = VoxelOffsetT(kSentinel15); + util::where(leafMask, packed_lc) = expanded; // One SIMD add across all LaneWidth lanes (one vpaddw/vpaddd instruction). const auto packed_sum = packed_lc + VoxelOffsetT(packed_d); - // Horizontal reductions: widen to uint32_t for the carry-bit checks. - uint32_t hor_or = 0u, hor_and = ~0u; - for (int i = 0; i < LaneWidth; ++i) { - const uint32_t s = static_cast(VO_traits::get(packed_sum, i)); - hor_or |= s; - hor_and &= s; - } + // Horizontal reductions for the carry-bit checks. 
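+        // (Illustrative: four lanes {0x9, 0xB, 0x8, 0xF} reduce to
+        //  bit_or = 0xF, bit_and = 0x8.)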
+ const auto hor_or = util::reduce(packed_sum, std::bit_or<>{}); + const auto hor_and = util::reduce(packed_sum, std::bit_and<>{}); // Per-axis may-cross flags: compile-time dispatch on sign of d. bool x_cross = false, y_cross = false, z_cross = false; @@ -269,11 +268,14 @@ class BatchAccessor // Probe neighbor directions not already cached. // Every direction here requires probeLeaf (center is pre-populated, never in toProbe). uint32_t toProbe = neededMask & ~mProbedMask; - while (toProbe) { - const int d = __builtin_ctz(toProbe); - mLeafNeighbors[d] = mGrid.tree().root().probeLeaf(originForDir(d)); - mProbedMask |= (1u << d); - toProbe &= toProbe - 1; + if (toProbe) { + const auto& root = mGrid.tree().root(); + do { + const int d = static_cast(util::countTrailingZeros(toProbe)); + mLeafNeighbors[d] = root.probeLeaf(originForDir(d)); + mProbedMask |= (1u << d); + toProbe &= toProbe - 1; + } while (toProbe); } } diff --git a/nanovdb/nanovdb/util/Util.h b/nanovdb/nanovdb/util/Util.h index bdff640a97..2c61a205f8 100644 --- a/nanovdb/nanovdb/util/Util.h +++ b/nanovdb/nanovdb/util/Util.h @@ -609,6 +609,41 @@ __hostdev__ inline uint32_t findLowestOn(uint64_t v) #endif }// util::findLowestOn(uint64_t) +// -------------------> countTrailingZeros <---------------------------- + +/// @brief Returns the number of trailing zero bits in the specified 32 bit word, +/// i.e. the index of the lowest set bit. +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t countTrailingZeros(uint32_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return __ffs(v) - 1; // one based indexing +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanForward(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return static_cast(__builtin_ctz(v)); +#else + //NANO_WARNING("Using software implementation for util::countTrailingZeros(uint32_t v)") + static const unsigned char DeBruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; +// disable unary minus on unsigned warning +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27]; +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(pop) +#endif + +#endif +}// util::countTrailingZeros(uint32_t) + // -------------------> findHighestOn <---------------------------- /// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 32 bit word diff --git a/simd_test/Simd.h b/simd_test/Simd.h index 567a3511b1..6cea2fe50e 100644 --- a/simd_test/Simd.h +++ b/simd_test/Simd.h @@ -94,6 +94,34 @@ inline Simd where(SimdMask mask, Simd a, Simd b) { return result; } +// 2-argument where: stdx-style masked-assignment proxy. +// where(mask, target) = value writes value[i] into target[i] for lanes where mask[i] is true. +// Heterogeneous mask (mask element type U may differ from value element type T). +// stdx::fixed_size_simd operator[] returns by value, so the assignment delegates to +// a boolean round-trip + stdx::where rather than a per-lane scalar store. 
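+// Usage (as in BatchAccessor::prefetch):
+//   util::where(leafMask, packed_lc) = expanded;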
+template +struct WhereExpression { + const SimdMask& mask; + Simd& target; + WhereExpression& operator=(const Simd& value) { + bool arr[W]; + for (int i = 0; i < W; ++i) arr[i] = static_cast(mask[i]); + SimdMask tmask(arr, element_aligned); + stdx::where(tmask, target) = value; + return *this; + } +}; +template +inline WhereExpression where(const SimdMask& mask, Simd& target) { + return {mask, target}; +} + +// Horizontal reduction: delegates to stdx::reduce. +// Mirrors std::experimental::reduce(v, binary_op) — same signature, same semantics. +// Use with std::bit_or<>{}, std::bit_and<>{}, std::plus<>{}, etc. +template +inline T reduce(Simd v, BinaryOp op) { return stdx::reduce(v, op); } + template inline bool any_of(SimdMask m) { return stdx::any_of(m); } template @@ -262,6 +290,34 @@ NANOVDB_SIMD_HOSTDEV Simd where(SimdMask mask, Simd a, Simd Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r; } +// 2-argument where: stdx-style masked-assignment proxy. +// where(mask, target) = value writes value[i] into target[i] for lanes where mask[i] is true. +// Heterogeneous mask (mask element type U may differ from value element type T). +template +struct WhereExpression { + const SimdMask& mask; + Simd& target; + NANOVDB_SIMD_HOSTDEV WhereExpression& operator=(const Simd& value) { + for (int i = 0; i < W; ++i) + if (mask[i]) target[i] = value[i]; + return *this; + } +}; +template +NANOVDB_SIMD_HOSTDEV WhereExpression where(const SimdMask& mask, Simd& target) { + return {mask, target}; +} + +// Horizontal reduction: fold all lanes with a binary operator. +// Mirrors std::experimental::reduce(v, binary_op). +// Use with std::bit_or<>{}, std::bit_and<>{}, std::plus<>{}, etc. +template +NANOVDB_SIMD_HOSTDEV T reduce(Simd v, BinaryOp op) { + T r = v[0]; + for (int i = 1; i < W; ++i) r = op(r, v[i]); + return r; +} + template NANOVDB_SIMD_HOSTDEV bool any_of(SimdMask m) { bool r = false; for (int i = 0; i < W; i++) r |= m[i]; return r; @@ -363,6 +419,7 @@ struct scalar_traits> { using type = T; }; template using scalar_traits_t = typename scalar_traits::type; + // --------------------------------------------------------------------------- // to_bitmask — fold SimdMask into a uint32_t (one bit per lane). // T is the associated element type; only W matters. Requires W <= 32. @@ -381,6 +438,8 @@ NANOVDB_SIMD_HOSTDEV uint32_t to_bitmask(SimdMask m) { template NANOVDB_SIMD_HOSTDEV T min(T a, T b) { return a < b ? a : b; } template NANOVDB_SIMD_HOSTDEV T max(T a, T b) { return a > b ? a : b; } template NANOVDB_SIMD_HOSTDEV T where(bool m, T a, T b) { return m ? a : b; } +template +NANOVDB_SIMD_HOSTDEV T reduce(T v, BinaryOp) { return v; } } // namespace util } // namespace nanovdb From b15d2a0a478c9f78906636bdcfe62109ff9effa2 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 16 Apr 2026 12:37:52 -0500 Subject: [PATCH 19/60] BatchAccessor: SIMD gather chain for cachedGetValue ingredient fetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-lane scalar loop in cachedGetValue with a fully SIMD gather chain that populates offsets, prefixSums, and maskWords without any scalar iteration over lanes. 
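Reference sketch of the gather primitive the chain assumes (array backend;
the actual Simd.h definition is not shown in this patch): dst[i] = base[idx[i]]
per lane, alongside the existing masked gather_if:

    template <typename T, typename I, int W>
    NANOVDB_SIMD_HOSTDEV Simd<T, W> gather(const T* base, Simd<I, W> idx) {
        Simd<T, W> r;
        for (int i = 0; i < W; ++i) r[i] = static_cast<T>(base[idx[i]]);
        return r;
    }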
Pipeline (all in Simd): packed_sum (uint16_t) → ×1129 → >>10 → &31 : d_vec (0..26), stays uint16_t — bits [10:14] of product lie below bit 16 so modular uint16_t multiply is exact → gather(mNeighborLeafIDs) : leaf_id_vec (uint32_t) → ×kStride : raw_idx (int32_t, null lanes → 0) → gather(offset_base) : mOffset per lane → gather(prefix_base) : mPrefixSum packed, then shift-extract field w → gather(mask_word_base + w) : valueMask().words()[w] per lane Switch mLeafNeighbors[27] (const LeafT*) to mNeighborLeafIDs[27] (uint32_t) with kNullLeafID = ~uint32_t(0) sentinel, enabling the flat-base SIMD gather pattern. prefetch and advance updated accordingly. Add simd_cast(Simd) to Simd.h for widening (uint16_t → int32_t, uint16_t → uint64_t, uint32_t → int32_t) used in gather index construction. Debug cross-check (#ifndef NDEBUG) validates all three vectors against the scalar reference path; 12M+ lane checks pass. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../BatchAccessor.h | 331 ++++++++++++++---- .../BatchAccessor.md | 139 ++++++-- simd_test/Simd.h | 20 ++ 3 files changed, 387 insertions(+), 103 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h index 813e19f130..024d0e747c 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h @@ -78,7 +78,7 @@ class BatchAccessor "BatchAccessor: ValueT lane width must be 1 (scalar) or match VoxelOffsetT"); // The SWAR packed layout in prefetch occupies bits 0–14 of each element - // (max packed value 0x71C7, max sum 0xE38E). The element type must therefore + // (max packed value 0x1CE7, max sum 0x4A52). The element type must therefore // be an unsigned integer of at least 16 bits; signed types produce UB on // carry overflow, and 8-bit types cannot hold the packed fields. using VoxelOffsetScalarT = util::scalar_traits_t; @@ -87,33 +87,36 @@ class BatchAccessor "(SWAR carry detection requires wrap-around, not signed overflow)"); static_assert(sizeof(VoxelOffsetScalarT) >= 2, "BatchAccessor: VoxelOffsetT element type must be at least 16 bits " - "(SWAR packed layout occupies bits 0-14, max sum 0xE38E)"); + "(SWAR packed layout occupies bits 0-14, max sum 0x4A52)"); public: // ------------------------------------------------------------------------- // Direction encoding // - // bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz ∈ {-1,0,+1} + // dir(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz ∈ {-1,0,+1} // // Selected entries: - // dir( 0, 0, 0) = 13 — center leaf (mLeafNeighbors[13]) + // dir( 0, 0, 0) = 13 — center leaf (mNeighborLeafIDs[13]) // dir(-1, 0, 0) = 4 — x-minus face // dir(+1, 0, 0) = 22 — x-plus face // dir( 0,-1, 0) = 10 — y-minus face // dir( 0,+1, 0) = 16 — y-plus face // dir( 0, 0,-1) = 12 — z-minus face // dir( 0, 0,+1) = 14 — z-plus face + // + // Sentinel leaf ID for directions outside the narrow band (no leaf exists). 
// ------------------------------------------------------------------------- - static constexpr int dir(int dx, int dy, int dz) + static constexpr int dir(int dx, int dy, int dz) { return (dx + 1) * 9 + (dy + 1) * 3 + (dz + 1); } + static constexpr uint32_t kNullLeafID = ~uint32_t(0); // ------------------------------------------------------------------------- // Construction // - // Eagerly populates mLeafNeighbors[dir(0,0,0)] (the center pointer) and - // marks bit 13 in mProbedMask. The center pointer is O(1) to compute + // Eagerly populates mNeighborLeafIDs[dir(0,0,0)] (the center leaf ID) and + // marks bit 13 in mProbedMask. The center ID is O(1) to compute // (no probeLeaf needed), so there is no reason to defer it. // // Consequence: cachedGetValue<0,0,0> is valid immediately after construction @@ -127,23 +130,26 @@ class BatchAccessor , mCenterOrigin(grid.tree().getFirstLeaf()[firstLeafID].origin()) , mProbedMask(1u << dir(0, 0, 0)) { - for (auto& p : mLeafNeighbors) p = nullptr; - mLeafNeighbors[dir(0, 0, 0)] = &mGrid.tree().getFirstLeaf()[mCenterLeafID]; + for (auto& id : mNeighborLeafIDs) id = kNullLeafID; + mNeighborLeafIDs[dir(0, 0, 0)] = mCenterLeafID; } // ------------------------------------------------------------------------- // advance — move to a new center leaf // // Call when none_of(leafMask): all active lanes have moved past mCenterLeafID. - // Repopulates the center pointer eagerly and resets mProbedMask to bit 13, - // so stale neighbor entries are blocked and the center is immediately valid. + // Resets all neighbor IDs to kNullLeafID, repopulates the center eagerly, + // and resets mProbedMask to bit 13 so the center is immediately valid. + // Resetting all 27 IDs (108 bytes) ensures mNeighborLeafIDs[d] == kNullLeafID + // iff bit d is absent from mProbedMask — a clean invariant for SIMD gather. // ------------------------------------------------------------------------- void advance(uint32_t newLeafID) { - mCenterLeafID = newLeafID; - mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); - mLeafNeighbors[dir(0,0,0)] = &mGrid.tree().getFirstLeaf()[newLeafID]; - mProbedMask = (1u << dir(0, 0, 0)); + mCenterLeafID = newLeafID; + mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); + for (auto& id : mNeighborLeafIDs) id = kNullLeafID; + mNeighborLeafIDs[dir(0, 0, 0)] = newLeafID; + mProbedMask = (1u << dir(0, 0, 0)); } // ------------------------------------------------------------------------- @@ -169,86 +175,96 @@ class BatchAccessor // Replace the scalar per-lane loop with a single SIMD add + two horizontal // reductions, using a 15-bit packed coordinate representation. // - // packed_lc layout (one group per axis, zero-guard gaps): - // bits 0– 2: lz carry exits at bit 3 (z-axis crossing) - // bits 6– 8: lx carry exits at bit 9 (x-axis crossing) - // bits 12–14: ly carry exits at bit 15 (y-axis crossing) + // packed_lc layout — 5-bit groups, tightly packed, no inter-group gaps: + // bits 0– 2: lz carry region bits 3–4 (z-axis) + // bits 5– 7: ly carry region bits 8–9 (y-axis) + // bits 10–12: lx carry region bits 13–14 (x-axis) + // + // All carry bits land within [0:14], fitting cleanly in uint16_t with + // bit 15 unused. The z,y,x ordering matches the weight sequence in + // dir(): (dz+1)×1 + (dy+1)×3 + (dx+1)×9. + // + // packed_tap = stencil offsets biased by +8, placed in the same groups: + // (dk+8) at bits [0:...] dk+8 ∈ [5,11] for dk ∈ [-3,3] + // (dj+8) at bits [5:...] + // (di+8) at bits [10:...] 
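+        //   e.g. (di,dj,dk) = (0,0,-1): packed_tap = 7 | 8<<5 | 8<<10 = 0x2107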
// - // This is expandVoxelOffset() steps 1+2 only (no step 3), because for a - // fixed (di,dj,dk) each axis has exactly one possible crossing direction, - // so we need only one group per axis rather than two. + // The +8 bias shifts the zero point so that the per-group sum + // s = lc + (d+8), lc ∈ [0,7], d ∈ [-3,3] → s ∈ [5,18] + // encodes the neighbor coordinate measured from the (-1,-1,-1) leaf: + // s ∈ [ 5, 7]: component + d < 0 → lo-neighbor (d < 0 case) + // s ∈ [ 8,15]: component + d ∈ [0,7] → center leaf + // s ∈ [16,18]: component + d ≥ 8 → hi-neighbor (d > 0 case) // - // packed_d = 3-bit two's complement of each offset placed in the same groups: - // dk & 7 at bits [0:2] (= 8+dk for dk<0, dk for dk>=0) - // di & 7 at bits [6:8] - // dj & 7 at bits [12:14] + // Carry bits after add: + // bit[+3] SET ↔ s ≥ 8 (= no lo-crossing) + // bit[+4] SET ↔ s ≥ 16 (= hi-crossing) // - // After SIMD add(packed_lc, packed_d): - // carry at bit 3 SET ↔ lz + dk ≥ 8 ↔ hi-z crossing (dk > 0) - // carry at bit 3 CLEAR ↔ lz + dk < 0 ↔ lo-z crossing (dk < 0) - // (same logic for x@bit9 and y@bit15) + // For prefetch, only one bit per axis is needed (compile-time dispatch): + // dk > 0: z_cross = hor_or & (1 << 4) — any lane has hi-z carry + // dk < 0: z_cross = !(hor_and & (1 << 3)) — any lane lacks lo-z guard + // (same at bits [9]/[8] for y, bits [14]/[13] for x) // - // Inactive lanes carry the sentinel (lc = 4 per axis), which satisfies - // |d| ≤ 4: never fires a false hi-carry, never clears a lo-carry. + // Inactive lanes carry sentinel lc = 4 per axis: s = d+12 ∈ [9,15] + // → bit[+3]=1, bit[+4]=0 → no crossing signal regardless of d. ✓ // - // For multi-axis taps (more than one nonzero component), the per-axis - // may-cross flags are combined conservatively: if two axes can independently - // cross, the edge/corner direction combining both is also added to neededMask. - // This may over-probe (extra probeLeaf if no single lane crosses both axes - // simultaneously) but never misses a direction any lane actually needs. - // For axis-aligned WENO5 taps (one nonzero component) there is no over-probing. + // For multi-axis taps, may-cross flags are combined conservatively. // ----------------------------------------------------------------------- // Use VoxelOffsetT directly for the packed arithmetic: LaneWidth elements // of VoxelOffsetScalarT in one register → one vpaddw (16-bit) or vpaddd // (32-bit) depending on the instantiation. All intermediate values fit: - // packed_lc ≤ 0x71C7, packed_d ≤ 0x71C7, sum ≤ 0xE38E < 2^16. + // packed_lc ≤ 0x1CE7, packed_tap ≤ 0x2D6B, sum ≤ 0x4A52 < 2^16. - // Compile-time packed stencil offset (3-bit two's complement per axis). - // d & 7u gives the 3-bit representation; for negative d, d & 7 = 8+d. - static constexpr auto packed_d = + // Compile-time packed stencil offset (+8-biased per axis, 5-bit groups). + static constexpr auto packed_tap = static_cast( - (unsigned(dk) & 7u) - | ((unsigned(di) & 7u) << 6) - | ((unsigned(dj) & 7u) << 12)); - - // Sentinel for inactive lanes: lc = (4,4,4) → packed = 4|(4<<6)|(4<<12). - // Note: expandVoxelOffset(kInactiveVoxelOffset=292) = kSentinel15, so even - // unconditionally expanded inactive-lane vo values yield the sentinel. - // However, straddle lanes carry arbitrary vo from the next leaf, so we - // must apply leafMask before the add to avoid false crossing signals. 
+ (unsigned(dk) + 8u) + | ((unsigned(dj) + 8u) << 5) + | ((unsigned(di) + 8u) << 10)); + + // Sentinel for inactive lanes: lc = (4,4,4) → packed = 4|(4<<5)|(4<<10). + // Straddle lanes carry arbitrary vo from the next leaf, so we must apply + // leafMask before the add to avoid false crossing signals. static constexpr auto kSentinel15 = - static_cast(4u | (4u << 6u) | (4u << 12u)); + static_cast(4u | (4u << 5u) | (4u << 10u)); + // Data mask: keeps bits [0:2], [5:7], [10:12] — the three 3-bit data fields. static constexpr auto kMask15 = - static_cast(0b111'000'111'000'111u); + static_cast(0b111'00'111'00'111u); - // Expand the 9-bit voxel offset into the 15-bit SWAR packed form — - // one vpor + vpsllw + vpand (no scalar loop). - // bits [0:2] = lz, bits [6:8] = lx, bits [12:14] = ly - // Then blend: active lanes → expanded form, straddle/inactive → sentinel. - // util::where(mask, target) = value uses the stdx-style 2-argument proxy: - // packed_lc is pre-initialised to kSentinel15; active lanes are overwritten - // with expanded. This form may emit vpblendvb more reliably under GCC. + // Expand the 9-bit voxel offset into the 15-bit SWAR packed form. + // vo = lx[6:8] | ly[3:5] | lz[0:2] (NanoVDB leaf layout) + // target: lx[10:12] | ly[5:7] | lz[0:2] + // + // (vo | (vo<<4)) & 0x1C07 places lz (stays at [0:2]) and lx ([6:8]→[10:12]) + // in one OR+mask; (vo<<2) & 0xE0 moves ly ([3:5]→[5:7]). const auto expanded = - (vo | (vo << VoxelOffsetScalarT(9))) & VoxelOffsetT(kMask15); + ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(0x1C07u)) + | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(0xE0u)); + + // Blend: active lanes → expanded form, straddle/inactive → sentinel. + // util::where(mask, target) = value uses the stdx-style 2-argument proxy: + // packed_lc is pre-initialised to kSentinel15; active lanes are overwritten. auto packed_lc = VoxelOffsetT(kSentinel15); util::where(leafMask, packed_lc) = expanded; // One SIMD add across all LaneWidth lanes (one vpaddw/vpaddd instruction). - const auto packed_sum = packed_lc + VoxelOffsetT(packed_d); + const auto packed_sum = packed_lc + VoxelOffsetT(packed_tap); // Horizontal reductions for the carry-bit checks. const auto hor_or = util::reduce(packed_sum, std::bit_or<>{}); const auto hor_and = util::reduce(packed_sum, std::bit_and<>{}); // Per-axis may-cross flags: compile-time dispatch on sign of d. + // Overflow (d>0): detected by the hi-carry bit (+4 from group base). + // Underflow (d<0): detected by absence of the lo-guard bit (+3). bool x_cross = false, y_cross = false, z_cross = false; - if constexpr (di > 0) x_cross = bool(hor_or & (1u << 9)); - if constexpr (di < 0) x_cross = !bool(hor_and & (1u << 9)); - if constexpr (dj > 0) y_cross = bool(hor_or & (1u << 15)); - if constexpr (dj < 0) y_cross = !bool(hor_and & (1u << 15)); - if constexpr (dk > 0) z_cross = bool(hor_or & (1u << 3)); + if constexpr (dk > 0) z_cross = bool(hor_or & (1u << 4)); if constexpr (dk < 0) z_cross = !bool(hor_and & (1u << 3)); + if constexpr (dj > 0) y_cross = bool(hor_or & (1u << 9)); + if constexpr (dj < 0) y_cross = !bool(hor_and & (1u << 8)); + if constexpr (di > 0) x_cross = bool(hor_or & (1u << 14)); + if constexpr (di < 0) x_cross = !bool(hor_and & (1u << 13)); // Compile-time crossing sign per axis. constexpr int sx = (di > 0) ? 
1 : -1; // only used when di != 0 @@ -271,8 +287,11 @@ class BatchAccessor if (toProbe) { const auto& root = mGrid.tree().root(); do { - const int d = static_cast(util::countTrailingZeros(toProbe)); - mLeafNeighbors[d] = root.probeLeaf(originForDir(d)); + const int d = static_cast(util::countTrailingZeros(toProbe)); + const LeafT* leafPtr = root.probeLeaf(originForDir(d)); + mNeighborLeafIDs[d] = leafPtr + ? uint32_t(leafPtr - mGrid.tree().getFirstLeaf()) + : kNullLeafID; mProbedMask |= (1u << d); toProbe &= toProbe - 1; } while (toProbe); @@ -294,6 +313,172 @@ class BatchAccessor template void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const { + // ----------------------------------------------------------------------- + // SIMD ingredient fetch (WIP — not yet wired to result) + // + // Recomputes packed_sum (same SWAR expansion as prefetch) to extract the + // three per-lane ingredients needed to replace leaf->getValue() with fully + // SIMD index arithmetic + value gather. See BatchAccessor.md §8d. + // + // offsets — leaf->mOffset: base value index for the leaf + // prefixSums — leaf->mPrefixSum[w]: prefix popcount up to x-slice w + // maskWords — leaf->mMask.mWords[w]: uint64_t mask for x-slice w + // + // w = dest_x = bits [10:12] of packed_sum (NanoVDB leaf layout: x is + // the most significant axis, so x-slices index the eight uint64_t words). + // + // dir per lane is extracted via the base-32 multiply trick (§8d): + // v = (packed_sum & 0x6318u) >> 3 + // dir = (v * 1129u) >> 10 + // + // Note: exact field names (mOffset, mPrefixSum, mMask.mWords) need + // verification against LeafData in NanoVDB.h. + // ----------------------------------------------------------------------- + { + const auto expanded = + ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(0x1C07u)) + | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(0x00E0u)); + + static constexpr auto packed_tap = + static_cast( + (unsigned(dk) + 8u) + | ((unsigned(dj) + 8u) << 5) + | ((unsigned(di) + 8u) << 10)); + static constexpr auto kSentinel15 = + static_cast(4u | (4u << 5u) | (4u << 10u)); + + auto packed_lc = VoxelOffsetT(kSentinel15); + util::where(leafMask, packed_lc) = expanded; + const auto packed_sum = packed_lc + VoxelOffsetT(packed_tap); + + // w per lane: dest_x = bits [10:12] → index of the uint64_t mask word + const auto w_vec = (packed_sum >> VoxelOffsetScalarT(10)) & VoxelOffsetT(7u); + + // SIMD gather of mOffset, mPrefixSum[w], and maskWords[w] per lane. + // + // Step 1 — d_vec: per-lane dir (0..26) via base-32 multiply trick (§8d). + // No widening needed: we extract bits [10:14] of (v * 1129). Those + // bits lie entirely below bit 16, so the modular uint16_t product gives + // the same answer as the full-width product for all valid + sentinel inputs. + // + // Step 2 — leaf_id_vec: gather mNeighborLeafIDs[d] for all lanes at once. + // + // Step 3 — raw_idx: leaf_id * (sizeof(LeafT)/sizeof(uint64_t)). + // This is the per-lane uint64_t-stride index into the flat leaf array, + // viewed as a uint64_t[] through the base pointer of the target field. + // Invalid (kNullLeafID) lanes are clamped to index 0 (safe; masked out). + // + // Step 4 — offsets / prefixSums: two gathers with different base pointers + // but the same raw_idx; masked to 0 for null lanes. + // mPrefixSum is a packed uint64_t: field w lives at bits [9*(w-1)+:9] + // (9-bit fields, w=0 → prefix = 0 by definition). + // + // Step 5 — maskWords: gather from valueMask().words() base. 
+ // words()[wi] for leaf[leaf_id] = mask_word_base[leaf_id*kStride + wi]. + // The per-lane wi is added to raw_idx to form the mask gather index. + using U32T = util::Simd; + using U64T = util::Simd; + using U64Traits = util::simd_traits; + + // Step 1 — d_vec: per-lane dir (0..26) via base-32 multiply (§8d). + // Stay in uint16_t throughout: bits [10:14] of (v * 1129) are entirely + // within the lower 16 bits, so the modular uint16_t product gives the + // same result as the full-width product for all valid inputs. + const auto d_u16 = ((packed_sum & VoxelOffsetT(0x6318u)) + >> VoxelOffsetScalarT(3)) + * VoxelOffsetT(1129u) >> VoxelOffsetScalarT(10) + & VoxelOffsetT(31u); + const auto d_i32 = util::simd_cast(d_u16); + + // Step 2 — leaf IDs + const auto leaf_id_vec = util::gather(mNeighborLeafIDs, d_i32); // Simd + const auto valid_u32 = (leaf_id_vec != U32T(kNullLeafID)); // SimdMask + + // Step 3 — stride-scaled gather indices (null lanes → 0) + static constexpr uint32_t kStride = sizeof(LeafT) / sizeof(uint64_t); + const auto raw_idx = util::simd_cast( + util::where(valid_u32, leaf_id_vec * U32T(kStride), U32T(0))); + + // Step 4a — offsets (mOffset) + const uint64_t* offset_base = reinterpret_cast( + &mGrid.tree().getFirstLeaf()[0].data()->mOffset); + const U64T offsets = util::where(valid_u32, + util::gather(offset_base, raw_idx), U64T(0)); + + // Step 4b — prefixSums (mPrefixSum packed uint64_t, shift-extract field w) + const uint64_t* prefix_base = reinterpret_cast( + &mGrid.tree().getFirstLeaf()[0].data()->mPrefixSum); + const auto prefix_raw = util::gather(prefix_base, raw_idx); + const auto w_u64 = util::simd_cast(w_vec); + const auto nonzero_w = (w_u64 != U64T(0)); + const auto shift = util::where(nonzero_w, (w_u64 - U64T(1)) * U64T(9), U64T(0)); + const U64T prefixSums = util::where(valid_u32, + util::where(nonzero_w, (prefix_raw >> shift) & U64T(511u), U64T(0)), + U64T(0)); + + // Step 5 — maskWords (valueMask().words()[w]) + // mask_word_base[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w] + // because the mask field is at a fixed offsetof within every LeafT. + const uint64_t* mask_word_base = + mGrid.tree().getFirstLeaf()[0].valueMask().words(); + const auto w_i32 = util::simd_cast(util::simd_cast(w_vec)); + const auto mask_idx = raw_idx + w_i32; + const U64T maskWords = util::where(valid_u32, + util::gather(mask_word_base, mask_idx), U64T(0)); + // ------------------------------------------------------------------- + // Debug cross-check: validate SIMD-path values against scalar ref + // ------------------------------------------------------------------- +#ifndef NDEBUG + for (int i = 0; i < LaneWidth; ++i) { + if (!Pred_traits::get(leafMask, i)) continue; + + // Scalar reference: same arithmetic as the legacy loop below + const auto vo_i = static_cast(VO_traits::get(vo, i)); + const int lx = (vo_i >> 6) & 7, ly = (vo_i >> 3) & 7, lz = vo_i & 7; + const int nx = lx + di, ny = ly + dj, nz = lz + dk; + const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0; + const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0; + const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0; + const int d_ref = dir(dx, dy, dz); + const int nx_w = nx - dx * 8; // = dest_x = word index w + const uint32_t ref_id = mNeighborLeafIDs[d_ref]; + const LeafT* ref = (ref_id != kNullLeafID) + ? 
&mGrid.tree().getFirstLeaf()[ref_id] : nullptr; + + // SIMD-path values for this lane + const uint32_t ps_i = static_cast(VO_traits::get(packed_sum, i)); + const int d_simd = int((((ps_i & 0x6318u) >> 3) * 1129u >> 10) & 31u); + const int wi = int(VO_traits::get(w_vec, i)); + + assert(d_simd == d_ref && "cachedGetValue SIMD: dir mismatch"); + assert(wi == nx_w && "cachedGetValue SIMD: w (dest_x) mismatch"); + + if (ref) { + const uint64_t pfx_ref = (uint32_t(nx_w) > 0u) + ? (ref->data()->mPrefixSum >> (9u * (uint32_t(nx_w) - 1u))) & 511u + : uint64_t(0); + assert(U64Traits::get(offsets, i) == ref->data()->mOffset + && "cachedGetValue SIMD: mOffset mismatch"); + assert(U64Traits::get(prefixSums, i) == pfx_ref + && "cachedGetValue SIMD: mPrefixSum mismatch"); + assert(U64Traits::get(maskWords, i) == ref->valueMask().words()[nx_w] + && "cachedGetValue SIMD: maskWord mismatch"); + } else { + assert(U64Traits::get(offsets, i) == uint64_t(0) + && "cachedGetValue SIMD: null leaf offsets should be 0"); + assert(U64Traits::get(prefixSums, i) == uint64_t(0) + && "cachedGetValue SIMD: null leaf prefixSums should be 0"); + assert(U64Traits::get(maskWords, i) == uint64_t(0) + && "cachedGetValue SIMD: null leaf maskWords should be 0"); + } + } +#endif + (void)offsets; (void)prefixSums; (void)maskWords; + } + + // ----------------------------------------------------------------------- + // Legacy scalar path — authoritative until SIMD path is wired in + // ----------------------------------------------------------------------- for (int i = 0; i < LaneWidth; ++i) { if (!Pred_traits::get(leafMask, i)) continue; const auto vo_i = static_cast(VO_traits::get(vo, i)); @@ -312,9 +497,11 @@ class BatchAccessor const uint32_t offset = uint32_t(nx_w) * 64u + uint32_t(ny_w) * 8u + uint32_t(nz_w); - const int d = dir(dx, dy, dz); + const int d = dir(dx, dy, dz); assert((mProbedMask & (1u << d)) && "cachedGetValue: direction not prefetched"); - const LeafT* leaf = mLeafNeighbors[d]; + const uint32_t leaf_id = mNeighborLeafIDs[d]; + const LeafT* leaf = (leaf_id != kNullLeafID) + ? &mGrid.tree().getFirstLeaf()[leaf_id] : nullptr; const ScalarValueT val = leaf ? static_cast(leaf->getValue(offset)) : ScalarValueT(0); @@ -337,7 +524,7 @@ class BatchAccessor uint32_t mCenterLeafID; Coord mCenterOrigin; uint32_t mProbedMask; - const LeafT* mLeafNeighbors[27]; + uint32_t mNeighborLeafIDs[27]; // kNullLeafID when not probed or outside narrow band }; } // namespace nanovdb diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md index 1353ee0cee..58b463b6b8 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md @@ -67,11 +67,11 @@ which works for both scalar and vector types via specialisation. 
Four members persist across batches within one center leaf: ```cpp -const GridT& mGrid; // for probeLeaf calls via mGrid.tree() -uint32_t mCenterLeafID; // index of current center leaf -Coord mCenterOrigin; // world-space origin of current center leaf -uint32_t mProbedMask = (1u << 13); // bit 13 (center) pre-set at construction -const LeafT* mLeafNeighbors[27]; // [13] = center (eager); others: lazily probed +const GridT& mGrid; // for probeLeaf calls via mGrid.tree() +uint32_t mCenterLeafID; // index of current center leaf +Coord mCenterOrigin; // world-space origin of current center leaf +uint32_t mProbedMask = (1u << 13); // bit 13 (center) pre-set at construction +uint32_t mNeighborLeafIDs[27]; // kNullLeafID when outside narrow band or unprobed ``` **Direction encoding** (`dir` is a `static constexpr` member): @@ -80,28 +80,37 @@ const LeafT* mLeafNeighbors[27]; // [13] = center (eager); others: l dir(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1) dx,dy,dz ∈ {-1,0,+1} ``` -`mLeafNeighbors[27]` is a flat array indexed by `dir(dx,dy,dz)`. -`mLeafNeighbors[13]` (= `dir(0,0,0)`) is the center leaf pointer. -`mLeafNeighbors[d]` is `nullptr` when the neighbor leaf lies outside the narrow band. +`mNeighborLeafIDs[27]` is a flat array indexed by `dir(dx,dy,dz)`. +`mNeighborLeafIDs[13]` (= `dir(0,0,0)`) holds the center leaf ID. +`mNeighborLeafIDs[d] = kNullLeafID` when the neighbor lies outside the narrow band or +has not yet been probed. -**Why pointers, not leaf IDs:** `cachedGetValue` accesses the leaf data array for -every active lane in every batch. Storing `const LeafT*` avoids a `base + id * -sizeof(LeafT)` multiply on every call; `nullptr` is a natural "outside narrow band" -sentinel. `NanoVDB::ReadAccessor` uses the same approach for its cached node pointers. +```cpp +static constexpr uint32_t kNullLeafID = ~uint32_t(0); +``` + +**Why leaf IDs, not pointers:** `cachedGetValue` fetches `mOffset`, `mPrefixSum`, and +`valueMask().words()[w]` for all active lanes via SIMD gathers (§8d). The gather index +is `leaf_id × (sizeof(LeafT)/sizeof(uint64_t))`, computed once per call as a +`Simd` multiply. Storing IDs enables a single flat-base gather over the +contiguous leaf array; storing pointers would require per-lane pointer arithmetic that +doesn't map to `vgatherdpd` / `vpgatherqq`. The `kNullLeafID` sentinel cleanly +replaces `nullptr` and is masked out in the gather via `where`. **Cache advance:** when `none_of(leafMask)` fires in the outer loop: ```cpp void advance(uint32_t newLeafID) { - mCenterLeafID = newLeafID; - mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); - mLeafNeighbors[dir(0,0,0)] = &mGrid.tree().getFirstLeaf()[newLeafID]; - mProbedMask = (1u << dir(0,0,0)); // center pre-set; neighbors stale + mCenterLeafID = newLeafID; + mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); + for (auto& id : mNeighborLeafIDs) id = kNullLeafID; + mNeighborLeafIDs[dir(0, 0, 0)] = newLeafID; + mProbedMask = (1u << dir(0, 0, 0)); } ``` -Stale neighbor entries in `mLeafNeighbors[]` are harmless: `mProbedMask` has only -bit 13 set, so `toProbe = neededMask & ~mProbedMask` will never return a stale index. +All 27 entries are reset to `kNullLeafID` on advance; `mProbedMask` is set to only +bit 13. `toProbe = neededMask & ~mProbedMask` therefore never returns a stale index. 
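+
+Expressed as a debug check (sketch; would live inside the class, since the
+members are private):
+
+```cpp
+for (int d = 0; d < 27; ++d)  // invariant: unprobed direction => null ID
+    assert((mProbedMask & (1u << d)) || mNeighborLeafIDs[d] == kNullLeafID);
+```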
--- @@ -142,7 +151,7 @@ while any_of(activeMask): constructor and `advance()`: ```cpp -mLeafNeighbors[dir(0,0,0)] = &mGrid.tree().getFirstLeaf()[mCenterLeafID]; +mNeighborLeafIDs[dir(0,0,0)] = mCenterLeafID; mProbedMask = (1u << dir(0,0,0)); // bit 13 pre-set ``` @@ -246,17 +255,17 @@ for each (sx,sy,sz) in {±1}³: ## 8. Implementation Notes -### 8a. Lane loop in prefetch / cachedGetValue +### 8a. Lane loops in prefetch / cachedGetValue -The current implementation uses a scalar `for (int i = 0; i < LaneWidth; ++i)` loop -over lanes, using `simd_traits::get` / `set` for per-lane access. This is correct -for both scalar (LaneWidth=1) and SIMD (LaneWidth=W) instantiations. +`prefetch` uses a scalar `for (int i = 0; i < LaneWidth; ++i)` loop over lanes. It is +called at most once per direction per center leaf, so the loop is not +performance-critical. -`prefetch` is called at most once per direction per center leaf, so the loop is not -performance-critical. `cachedGetValue` is in the hot path; the loop over W=16 lanes -with scalar per-lane `leaf->getValue(offset)` is a first correct implementation. -Vectorising this loop (SIMD offset arithmetic + `vgatherdps`) is the Phase 2 -optimisation task described in `StencilGather.md §7b`. +`cachedGetValue` uses a scalar loop only for the legacy scalar value-fetch path (the +authoritative result path until the full SIMD index pipeline is wired in). The +ingredient-fetch block — `mOffset`, `mPrefixSum[w]`, and `valueMask().words()[w]` for +all active lanes — is already **fully SIMD** via the gather chain described in §8d. +`prefetch` remains scalar and is not performance-critical. ### 8b. No tree accessor in prefetch @@ -271,6 +280,72 @@ internal-node caches are bypassed entirely when `get` misses at LEVEL=0 outside the active narrow band. `cachedGetValue` checks for `nullptr` and returns `ScalarValueT(0)`, which is correct for level-set grids (background value = 0). +### 8d. SWAR direction extraction — the base-32 multiply trick + +`cachedGetValue` must compute a **per-lane** neighbor direction `dir ∈ [0,26]` at +runtime, because for a fixed compile-time tap `(di, dj, dk)` different lanes can land +in different neighbor leaves (one lane may cross only the z-face; another may cross +x and z; another may stay in the center leaf). + +`dir` is the mixed-radix value `dir = cz + 3·cy + 9·cx` where each carry component +`cz, cy, cx ∈ {0,1,2}` encodes {underflow, in-leaf, overflow} for the z-, y-, x-axis +respectively. The carry components are already sitting inside the SWAR `packed_sum` +(see §8a / `prefetch` implementation) at bit positions [3:4], [8:9], [13:14]. + +**Step 1 — extract carry pairs into base-32 digits** + +```cpp +// mask the six carry bits, right-shift by 3 +// result layout: 0b 00xx 000 yy 000 zz (three 2-bit fields, 3-bit gaps) +auto v = (packed_sum & VoxelOffsetT(0x6318u)) >> 3; +``` + +The 3-bit gaps are not accidental: the 5-bit SWAR groups naturally give a +**base-32 representation**. With the `>> 3` shift, `v` is the 3-digit duotrigesimal +(base-32) number `0d cx·cy·cz`, where digit-k = the carry component for axis k. 
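+
+For example (one illustrative lane): a tap that overflows x (cx = 2), stays in-leaf
+in y (cy = 1), and underflows z (cz = 0) gives `v = 0 + 1·32 + 2·1024 = 2080 = 0d 2'1'0`;
+Step 2 below maps this to `dir = 0 + 3·1 + 9·2 = 21 = dir(+1, 0, -1)`.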
+**Step 2 — re-evaluate the same digits in base 3 via a single multiply**
+
+```cpp
+// 0d 1'3'9 = 1·32² + 3·32 + 9 = 1024 + 96 + 9 = 1129
+auto dir_vec = ((v * VoxelOffsetT(1129u)) >> 10) & VoxelOffsetT(31u);
+// bits [10:14] of the product = digit-2 of v·(0d 1'3'9) = cz + 3·cy + 9·cx = dir;
+// the & 31 clears digits 3 and 4, which survive the shift (see below)
+```
+
+**Why digit-2 of the product equals `dir`:**
+
+Base-32 long multiplication `(0d cx·cy·cz) × (0d 1·3·9)`:
+
+| Digit of product | Contributions | Max value |
+|---|---|---|
+| 0 | 9·cz | 18 |
+| 1 | 3·cz + 9·cy | 24 |
+| **2** | **cz + 3·cy + 9·cx** | **26** |
+| 3 | cy + 3·cx | 8 |
+| 4 | cx | 2 |
+
+Every digit sum is **< 32**, so **no carries propagate between base-32 digits**.
+Digit 2 is therefore exact: it equals `cz + 3·cy + 9·cx = dir` with no contamination
+from adjacent digits. Digit-2 occupies bits [10:14] of the integer product, which is
+why `>> 10` followed by `& 31` extracts it. The mask is required, not optional:
+digits 3 and 4 are nonzero whenever cy or cx is, and they remain above bit 4 after
+the shift.
+
+**Sanity check** (all 27 valid inputs):
+
+```cpp
+for (int cx : {0,1,2}) for (int cy : {0,1,2}) for (int cz : {0,1,2}) {
+    uint32_t v = cz + 32*cy + 1024*cx;
+    uint32_t dir = ((v * 1129u) >> 10) & 31u;
+    assert(dir == unsigned(cz + 3*cy + 9*cx));
+}
+```
+
+---
+
 ## 9. Relationship to Phase 1 Prototype
 
@@ -290,9 +365,11 @@ outside the active narrow band. `cachedGetValue` checks for `nullptr` and retur
 
 ## 10. Future Work
 
-- **`cachedGetValue` vectorisation (Phase 2):** replace per-lane scalar loop with SIMD
-  offset arithmetic + `vgatherdps` × 2 + `vpblendvb` for the two-pointer case.
-  See `StencilGather.md §7b` for the AVX2 profile.
+- **`cachedGetValue` vectorisation (Phase 2):** ingredient fetch (`mOffset`,
+  `mPrefixSum[w]`, `valueMask().words()[w]`) is now fully SIMD via the gather chain
+  in §8d. Remaining: popcount `(maskWord & partial_mask)` → global value index →
+  `gather_if(result, leafMask, globalValueArray, indices)` to replace the scalar
+  `leaf->getValue(offset)` loop. See `StencilGather.md §7b` for the AVX2 profile.
 
 - **`getValue`:** lazy combined `prefetch` + `cachedGetValue`.
 
diff --git a/simd_test/Simd.h b/simd_test/Simd.h
index 567a3511b1..6cea2fe50e 100644
--- a/simd_test/Simd.h
+++ b/simd_test/Simd.h
@@ -358,6 +358,26 @@ NANOVDB_SIMD_HOSTDEV void gather_if(Simd& dst, SimdMask mask,
 
 #endif // NANOVDB_USE_STD_SIMD
 
+// ---------------------------------------------------------------------------
+// simd_cast — element-wise static_cast between Simd types of the same W.
+//
+// Used for widening (uint16_t → uint32_t, uint32_t → uint64_t) and for
+// reinterpreting signedness (uint32_t → int32_t) when building gather indices.
+// Both backends: the array backend uses a lane loop; the stdx backend uses the
+// generator constructor, which the compiler lowers to a vpmovsxbw / vpmovzxwd
+// sequence or similar sign/zero-extend instruction depending on the types.
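+//
+// Usage (as in BatchAccessor::cachedGetValue), with explicit destination
+// element type:
+//   const auto d_i32 = util::simd_cast<int32_t>(d_u16);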
+// --------------------------------------------------------------------------- +template +NANOVDB_SIMD_HOSTDEV Simd simd_cast(Simd src) { +#ifdef NANOVDB_USE_STD_SIMD + return Simd([&](int i) { return static_cast(src[i]); }); +#else + Simd r; + for (int i = 0; i < W; ++i) r[i] = static_cast(src[i]); + return r; +#endif +} + // --------------------------------------------------------------------------- // simd_traits — generic per-lane access for scalar and Simd types. // From 6a4711bec301fce7310317f9893823c9fa097981 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Thu, 16 Apr 2026 13:18:49 -0500 Subject: [PATCH 20/60] BatchAccessor.md: document prefetch SIMD structure + assembly findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit §8a was stale ("prefetch uses a scalar loop") — prefetch has no per-lane loop. Update with accurate description of its two-phase structure: - SWAR expansion + vpblendvb + vpaddw: fully in YMM across all LaneWidth lanes (vpsllw, vpor, vpand, vpblendvb, vpaddw) - Horizontal reduction: unavoidable vextracti128 + vpand/vpor tree to produce scalar hor_and / hor_or for the per-axis crossing decision Include the actual Release assembly (ex_stencil_gather_cpu, -O3 -mavx2) confirming the YMM path survived the mLeafNeighbors → mNeighborLeafIDs encoding change. §8c: update nullptr sentinel language to reflect kNullLeafID / valid_u32 mask in the SIMD gather chain. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../BatchAccessor.md | 55 +++++++++++++++---- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md index 58b463b6b8..6b3e98dd7d 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md @@ -255,17 +255,47 @@ for each (sx,sy,sz) in {±1}³: ## 8. Implementation Notes -### 8a. Lane loops in prefetch / cachedGetValue +### 8a. SIMD structure of prefetch and cachedGetValue -`prefetch` uses a scalar `for (int i = 0; i < LaneWidth; ++i)` loop over lanes. It is -called at most once per direction per center leaf, so the loop is not -performance-critical. +**`prefetch` — fully SIMD for the crossing detection, scalar only for probeLeaf** -`cachedGetValue` uses a scalar loop only for the legacy scalar value-fetch path (the -authoritative result path until the full SIMD index pipeline is wired in). The -ingredient-fetch block — `mOffset`, `mPrefixSum[w]`, and `valueMask().words()[w]` for -all active lanes — is already **fully SIMD** via the gather chain described in §8d. -`prefetch` remains scalar and is not performance-critical. +`prefetch` contains no per-lane scalar loop. The crossing decision uses: + +1. **SWAR expansion** (YMM throughout): `vpsllw`, `vpor`, `vpand` — maps the 9-bit + voxel offset vector into the 15-bit packed form across all LaneWidth lanes. +2. **Sentinel blend**: `vpblendvb` — applies `leafMask` in one instruction. +3. **Add**: `vpaddw` — adds the compile-time `packed_tap` across all lanes. +4. **Horizontal reductions**: `vextracti128` + `vpand`/`vpor` tree → scalar `hor_and` + / `hor_or` — unavoidable for the crossing decision, which is a single bool per axis. 
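+
+For one tap this collapses to a single scalar test per axis (sketch; the `dk = -1`
+case, matching the bit assignments above):
+
+```cpp
+const bool z_cross = !(util::reduce(packed_sum, std::bit_and<>{}) & (1u << 3));
+```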
+
+Assembly-confirmed (Release, `-O3 -mavx2`, `ex_stencil_gather_cpu`):
+
+```
+vmovdqu (%rbx,%rax,2),%ymm2 ; load vo (16 × uint16_t)
+vpsllw $0x4,%ymm2,%ymm0 ; vo << 4
+vpor %ymm2,%ymm0,%ymm0 ; vo | (vo << 4)
+vpand %ymm1,%ymm0,%ymm0 ; & 0x1C07
+vpsllw $0x2,%ymm2,%ymm1 ; vo << 2
+vpand %ymm2,%ymm1,%ymm1 ; & 0xE0
+vpor %ymm1,%ymm0,%ymm0 ; → expanded
+vpblendvb %ymm1,%ymm0,%ymm6,%ymm1 ; where(leafMask, packed_lc) = expanded
+vpaddw %ymm2,%ymm1,%ymm1 ; packed_sum = packed_lc + packed_tap
+vextracti128 $0x1,%ymm1,%xmm2 ; \
+vpand %xmm1,%xmm2,%xmm2 ; | hor_and tree:
+vpunpckhwd ... ; | 16→8→4→2→1 lanes
+vpand ...; vpshufd ...; vpand .. ; |
+vpextrw $0x0,%xmm1,%eax ; / scalar hor_and
+```
+
+After the scalar crossing check, `probeLeaf` is called at most once per unique
+direction per center leaf — inherently scalar tree traversal, not per-voxel.
+
+**`cachedGetValue` — SIMD ingredient fetch, scalar value path**
+
+The ingredient-fetch block — `mOffset`, `mPrefixSum[w]`, and `valueMask().words()[w]`
+for all active lanes — is **fully SIMD** via the gather chain described in §8d.
+The final value-fetch (scalar loop over `leaf->getValue(offset)`) is the remaining
+work before the full SIMD index pipeline is wired in.
 
 ### 8b. No tree accessor in prefetch
 
@@ -277,8 +307,11 @@ internal-node caches are bypassed entirely when `get` misses at LEVEL=0
 
 ### 8c. probeLeaf returns nullptr for missing neighbors
 
 `mGrid.tree().probeLeaf(coord)` returns `nullptr` when the requested coordinate lies
-outside the active narrow band. `cachedGetValue` checks for `nullptr` and returns
-`ScalarValueT(0)`, which is correct for level-set grids (background value = 0).
+outside the active narrow band. `prefetch` stores `kNullLeafID` in
+`mNeighborLeafIDs[d]` for those directions. `cachedGetValue` detects `kNullLeafID`
+and writes `ScalarValueT(0)` for those lanes, which is correct for level-set grids
+(background value = 0). The SIMD gather chain clamps `kNullLeafID` lanes to leaf 0
+for the loads and zeroes the gathered values via the `valid_u32` mask.
 
 ### 8d. SWAR direction extraction — the base-32 multiply trick
 
From d511842d1065531b2dd2512f036b8586563ba4c4 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Thu, 16 Apr 2026 20:15:45 -0500
Subject: [PATCH 21/60] BatchAccessor: SWAR constant naming, ASCII cleanup,
 explicit parentheses

- Promote SWAR encoding literals to class-scope uint16_t constants:
  kSwarXZMask (0x1C07), kSwarYMask (0x00E0), kSwarSentinel (4|4<<5|4<<10).
  Shared between prefetch and cachedGetValue; implicit conversion to
  VoxelOffsetT at Simd construction time.
- Add direction-extraction local constants in cachedGetValue:
  kSwarCarryMask (0x6318), kDirMul (1129), kDirMask (31).
- Rename w_vec -> wordIndex for clarity.
- Move U64Traits alias inside #ifndef NDEBUG where it is only used.
- Replace non-ASCII characters (em dashes, arrows, element-of, comparison
  operators) with ASCII equivalents throughout.
- Add explicit parentheses around the d_u16 shift-then-mask chain to make
  operator precedence unambiguous.
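
  Illustration (sketch of the intent; the chain previously relied on operator
  precedence, * binding over >> over &):

      d_u16 = ((((packed_sum & kSwarCarryMask) >> 3) * kDirMul) >> 10) & kDirMask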
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../BatchAccessor.h | 186 +++++++++--------- 1 file changed, 96 insertions(+), 90 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h index 024d0e747c..56b888aacf 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h @@ -6,7 +6,7 @@ \brief SIMD-batch analog of NanoVDB's ValueAccessor. - Caches the 27-entry 3×3×3 leaf-neighbor pointer table around the current + Caches the 27-entry 3x3x3 leaf-neighbor pointer table around the current center leaf, amortizing probeLeaf calls across all batches that process voxels within that leaf. @@ -21,7 +21,7 @@ For NanoGrid: uint64_t or Simd VoxelOffsetT Compact (9-bit) voxel offset within a leaf. Scalar path: uint16_t. SIMD path: Simd. - LeafIDT Leaf index type — reserved for future use by the caller loop. + LeafIDT Leaf index type -- reserved for future use by the caller loop. Scalar: uint32_t. SIMD: Simd. PredicateT Per-lane active predicate (the leafMask). Scalar: bool. SIMD: SimdMask or similar. @@ -31,10 +31,10 @@ Scalar defaults allow instantiation without a SIMD library. For SIMD use, substitute the concrete Simd<> and SimdMask<> types. - API (see BatchAccessor.md §5 for the full design): - - advance(newLeafID) — move to a new center leaf - - prefetch(vo, mask) — warm cache for tap (di,dj,dk) - - cachedGetValue(result, vo, mask) — fill masked result lanes + API (see BatchAccessor.md Sec.5 for the full design): + - advance(newLeafID) -- move to a new center leaf + - prefetch(vo, mask) -- warm cache for tap (di,dj,dk) + - cachedGetValue(result, vo, mask) -- fill masked result lanes */ #pragma once @@ -77,7 +77,7 @@ class BatchAccessor static_assert(Val_traits::width == 1 || Val_traits::width == VO_traits::width, "BatchAccessor: ValueT lane width must be 1 (scalar) or match VoxelOffsetT"); - // The SWAR packed layout in prefetch occupies bits 0–14 of each element + // The SWAR packed layout in prefetch occupies bits 0-14 of each element // (max packed value 0x1CE7, max sum 0x4A52). The element type must therefore // be an unsigned integer of at least 16 bits; signed types produce UB on // carry overflow, and 8-bit types cannot hold the packed fields. @@ -93,16 +93,16 @@ class BatchAccessor // ------------------------------------------------------------------------- // Direction encoding // - // dir(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz ∈ {-1,0,+1} + // dir(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz in {-1,0,+1} // // Selected entries: - // dir( 0, 0, 0) = 13 — center leaf (mNeighborLeafIDs[13]) - // dir(-1, 0, 0) = 4 — x-minus face - // dir(+1, 0, 0) = 22 — x-plus face - // dir( 0,-1, 0) = 10 — y-minus face - // dir( 0,+1, 0) = 16 — y-plus face - // dir( 0, 0,-1) = 12 — z-minus face - // dir( 0, 0,+1) = 14 — z-plus face + // dir( 0, 0, 0) = 13 -- center leaf (mNeighborLeafIDs[13]) + // dir(-1, 0, 0) = 4 -- x-minus face + // dir(+1, 0, 0) = 22 -- x-plus face + // dir( 0,-1, 0) = 10 -- y-minus face + // dir( 0,+1, 0) = 16 -- y-plus face + // dir( 0, 0,-1) = 12 -- z-minus face + // dir( 0, 0,+1) = 14 -- z-plus face // // Sentinel leaf ID for directions outside the narrow band (no leaf exists). 
// ------------------------------------------------------------------------- @@ -112,6 +112,20 @@ class BatchAccessor } static constexpr uint32_t kNullLeafID = ~uint32_t(0); + // ------------------------------------------------------------------------- + // SWAR 15-bit packed encoding constants + // + // packed layout: lx[10:12] | gap[13:14] | ly[5:7] | gap[8:9] | lz[0:2] | gap[3:4] + // + // kSwarXZMask -- keeps lz [0:2] and lx [6:8->10:12] after (vo | vo<<4) + // kSwarYMask -- keeps ly [3:5->5:7] after (vo<<2) + // kSwarSentinel-- inactive-lane value: lx=ly=lz=4, chosen so that + // (sentinel + tap) never triggers a false crossing signal + // ------------------------------------------------------------------------- + static constexpr uint16_t kSwarXZMask = 0x1C07u; + static constexpr uint16_t kSwarYMask = 0x00E0u; + static constexpr uint16_t kSwarSentinel = 4u | (4u << 5u) | (4u << 10u); + // ------------------------------------------------------------------------- // Construction // @@ -135,13 +149,13 @@ class BatchAccessor } // ------------------------------------------------------------------------- - // advance — move to a new center leaf + // advance -- move to a new center leaf // // Call when none_of(leafMask): all active lanes have moved past mCenterLeafID. // Resets all neighbor IDs to kNullLeafID, repopulates the center eagerly, // and resets mProbedMask to bit 13 so the center is immediately valid. // Resetting all 27 IDs (108 bytes) ensures mNeighborLeafIDs[d] == kNullLeafID - // iff bit d is absent from mProbedMask — a clean invariant for SIMD gather. + // iff bit d is absent from mProbedMask -- a clean invariant for SIMD gather. // ------------------------------------------------------------------------- void advance(uint32_t newLeafID) { @@ -153,7 +167,7 @@ class BatchAccessor } // ------------------------------------------------------------------------- - // prefetch — warm the neighbor cache for stencil tap (di,dj,dk) + // prefetch -- warm the neighbor cache for stencil tap (di,dj,dk) // // For each active (leafMask) lane, computes which neighbor leaf the tap lands // in and probes it into mLeafNeighbors[] if not already cached in mProbedMask. @@ -175,46 +189,46 @@ class BatchAccessor // Replace the scalar per-lane loop with a single SIMD add + two horizontal // reductions, using a 15-bit packed coordinate representation. // - // packed_lc layout — 5-bit groups, tightly packed, no inter-group gaps: - // bits 0– 2: lz carry region bits 3–4 (z-axis) - // bits 5– 7: ly carry region bits 8–9 (y-axis) - // bits 10–12: lx carry region bits 13–14 (x-axis) + // packed_lc layout -- 5-bit groups, tightly packed, no inter-group gaps: + // bits 0- 2: lz carry region bits 3-4 (z-axis) + // bits 5- 7: ly carry region bits 8-9 (y-axis) + // bits 10-12: lx carry region bits 13-14 (x-axis) // // All carry bits land within [0:14], fitting cleanly in uint16_t with // bit 15 unused. The z,y,x ordering matches the weight sequence in - // dir(): (dz+1)×1 + (dy+1)×3 + (dx+1)×9. + // dir(): (dz+1)x1 + (dy+1)x3 + (dx+1)x9. // // packed_tap = stencil offsets biased by +8, placed in the same groups: - // (dk+8) at bits [0:...] dk+8 ∈ [5,11] for dk ∈ [-3,3] + // (dk+8) at bits [0:...] dk+8 in [5,11] for dk in [-3,3] // (dj+8) at bits [5:...] // (di+8) at bits [10:...] 
// // The +8 bias shifts the zero point so that the per-group sum - // s = lc + (d+8), lc ∈ [0,7], d ∈ [-3,3] → s ∈ [5,18] + // s = lc + (d+8), lc in [0,7], d in [-3,3] -> s in [5,18] // encodes the neighbor coordinate measured from the (-1,-1,-1) leaf: - // s ∈ [ 5, 7]: component + d < 0 → lo-neighbor (d < 0 case) - // s ∈ [ 8,15]: component + d ∈ [0,7] → center leaf - // s ∈ [16,18]: component + d ≥ 8 → hi-neighbor (d > 0 case) + // s in [ 5, 7]: component + d < 0 -> lo-neighbor (d < 0 case) + // s in [ 8,15]: component + d in [0,7] -> center leaf + // s in [16,18]: component + d >= 8 -> hi-neighbor (d > 0 case) // // Carry bits after add: - // bit[+3] SET ↔ s ≥ 8 (= no lo-crossing) - // bit[+4] SET ↔ s ≥ 16 (= hi-crossing) + // bit[+3] SET <=> s >= 8 (= no lo-crossing) + // bit[+4] SET <=> s >= 16 (= hi-crossing) // // For prefetch, only one bit per axis is needed (compile-time dispatch): - // dk > 0: z_cross = hor_or & (1 << 4) — any lane has hi-z carry - // dk < 0: z_cross = !(hor_and & (1 << 3)) — any lane lacks lo-z guard + // dk > 0: z_cross = hor_or & (1 << 4) -- any lane has hi-z carry + // dk < 0: z_cross = !(hor_and & (1 << 3)) -- any lane lacks lo-z guard // (same at bits [9]/[8] for y, bits [14]/[13] for x) // - // Inactive lanes carry sentinel lc = 4 per axis: s = d+12 ∈ [9,15] - // → bit[+3]=1, bit[+4]=0 → no crossing signal regardless of d. ✓ + // Inactive lanes carry sentinel lc = 4 per axis: s = d+12 in [9,15] + // -> bit[+3]=1, bit[+4]=0 -> no crossing signal regardless of d. (ok) // // For multi-axis taps, may-cross flags are combined conservatively. // ----------------------------------------------------------------------- // Use VoxelOffsetT directly for the packed arithmetic: LaneWidth elements - // of VoxelOffsetScalarT in one register → one vpaddw (16-bit) or vpaddd + // of VoxelOffsetScalarT in one register -> one vpaddw (16-bit) or vpaddd // (32-bit) depending on the instantiation. All intermediate values fit: - // packed_lc ≤ 0x1CE7, packed_tap ≤ 0x2D6B, sum ≤ 0x4A52 < 2^16. + // packed_lc <= 0x1CE7, packed_tap <= 0x2D6B, sum <= 0x4A52 < 2^16. // Compile-time packed stencil offset (+8-biased per axis, 5-bit groups). static constexpr auto packed_tap = @@ -223,29 +237,18 @@ class BatchAccessor | ((unsigned(dj) + 8u) << 5) | ((unsigned(di) + 8u) << 10)); - // Sentinel for inactive lanes: lc = (4,4,4) → packed = 4|(4<<5)|(4<<10). - // Straddle lanes carry arbitrary vo from the next leaf, so we must apply - // leafMask before the add to avoid false crossing signals. - static constexpr auto kSentinel15 = - static_cast(4u | (4u << 5u) | (4u << 10u)); - // Data mask: keeps bits [0:2], [5:7], [10:12] — the three 3-bit data fields. - static constexpr auto kMask15 = - static_cast(0b111'00'111'00'111u); - // Expand the 9-bit voxel offset into the 15-bit SWAR packed form. // vo = lx[6:8] | ly[3:5] | lz[0:2] (NanoVDB leaf layout) // target: lx[10:12] | ly[5:7] | lz[0:2] // - // (vo | (vo<<4)) & 0x1C07 places lz (stays at [0:2]) and lx ([6:8]→[10:12]) - // in one OR+mask; (vo<<2) & 0xE0 moves ly ([3:5]→[5:7]). + // (vo | (vo<<4)) & kSwarXZMask places lz (stays at [0:2]) and lx ([6:8]->[10:12]) + // in one OR+mask; (vo<<2) & kSwarYMask moves ly ([3:5]->[5:7]). 
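+        //   e.g. vo = 0b111'010'001 (lx=7, ly=2, lz=1):
+        //     (vo | vo<<4) & kSwarXZMask = 0x1C01, (vo<<2) & kSwarYMask = 0x0040,
+        //     expanded = 0x1C41 = 7<<10 | 2<<5 | 1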
const auto expanded = - ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(0x1C07u)) - | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(0xE0u)); + ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask)) + | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask)); - // Blend: active lanes → expanded form, straddle/inactive → sentinel. - // util::where(mask, target) = value uses the stdx-style 2-argument proxy: - // packed_lc is pre-initialised to kSentinel15; active lanes are overwritten. - auto packed_lc = VoxelOffsetT(kSentinel15); + // Blend: active lanes -> expanded form, straddle/inactive -> sentinel. + auto packed_lc = VoxelOffsetT(kSwarSentinel); util::where(leafMask, packed_lc) = expanded; // One SIMD add across all LaneWidth lanes (one vpaddw/vpaddd instruction). @@ -299,7 +302,7 @@ class BatchAccessor } // ------------------------------------------------------------------------- - // cachedGetValue — fill masked result lanes from cached leaf table + // cachedGetValue -- fill masked result lanes from cached leaf table // // For each active (leafMask) lane, computes the local voxel offset within the // appropriate neighbor leaf and calls leaf->getValue(offset). @@ -314,20 +317,20 @@ class BatchAccessor void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const { // ----------------------------------------------------------------------- - // SIMD ingredient fetch (WIP — not yet wired to result) + // SIMD ingredient fetch (WIP -- not yet wired to result) // // Recomputes packed_sum (same SWAR expansion as prefetch) to extract the // three per-lane ingredients needed to replace leaf->getValue() with fully - // SIMD index arithmetic + value gather. See BatchAccessor.md §8d. + // SIMD index arithmetic + value gather. See BatchAccessor.md Sec.8d. // - // offsets — leaf->mOffset: base value index for the leaf - // prefixSums — leaf->mPrefixSum[w]: prefix popcount up to x-slice w - // maskWords — leaf->mMask.mWords[w]: uint64_t mask for x-slice w + // offsets -- leaf->mOffset: base value index for the leaf + // prefixSums -- leaf->mPrefixSum[w]: prefix popcount up to x-slice w + // maskWords -- leaf->mMask.mWords[w]: uint64_t mask for x-slice w // // w = dest_x = bits [10:12] of packed_sum (NanoVDB leaf layout: x is // the most significant axis, so x-slices index the eight uint64_t words). // - // dir per lane is extracted via the base-32 multiply trick (§8d): + // dir per lane is extracted via the base-32 multiply trick (Sec.8d): // v = (packed_sum & 0x6318u) >> 3 // dir = (v * 1129u) >> 10 // @@ -335,93 +338,95 @@ class BatchAccessor // verification against LeafData in NanoVDB.h. 
// ----------------------------------------------------------------------- { - const auto expanded = - ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(0x1C07u)) - | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(0x00E0u)); - static constexpr auto packed_tap = static_cast( (unsigned(dk) + 8u) | ((unsigned(dj) + 8u) << 5) | ((unsigned(di) + 8u) << 10)); - static constexpr auto kSentinel15 = - static_cast(4u | (4u << 5u) | (4u << 10u)); + const auto expanded = + ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask)) + | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask)); - auto packed_lc = VoxelOffsetT(kSentinel15); + auto packed_lc = VoxelOffsetT(kSwarSentinel); util::where(leafMask, packed_lc) = expanded; const auto packed_sum = packed_lc + VoxelOffsetT(packed_tap); - // w per lane: dest_x = bits [10:12] → index of the uint64_t mask word - const auto w_vec = (packed_sum >> VoxelOffsetScalarT(10)) & VoxelOffsetT(7u); + // dest_x per lane: bits [10:12] of packed_sum -> uint64_t mask word index (0..7) + const auto wordIndex = (packed_sum >> VoxelOffsetScalarT(10)) & VoxelOffsetT(7u); // SIMD gather of mOffset, mPrefixSum[w], and maskWords[w] per lane. // - // Step 1 — d_vec: per-lane dir (0..26) via base-32 multiply trick (§8d). + // Step 1 -- d_vec: per-lane dir (0..26) via base-32 multiply trick (Sec.8d). // No widening needed: we extract bits [10:14] of (v * 1129). Those // bits lie entirely below bit 16, so the modular uint16_t product gives // the same answer as the full-width product for all valid + sentinel inputs. // - // Step 2 — leaf_id_vec: gather mNeighborLeafIDs[d] for all lanes at once. + // Step 2 -- leaf_id_vec: gather mNeighborLeafIDs[d] for all lanes at once. // - // Step 3 — raw_idx: leaf_id * (sizeof(LeafT)/sizeof(uint64_t)). + // Step 3 -- raw_idx: leaf_id * (sizeof(LeafT)/sizeof(uint64_t)). // This is the per-lane uint64_t-stride index into the flat leaf array, // viewed as a uint64_t[] through the base pointer of the target field. // Invalid (kNullLeafID) lanes are clamped to index 0 (safe; masked out). // - // Step 4 — offsets / prefixSums: two gathers with different base pointers + // Step 4 -- offsets / prefixSums: two gathers with different base pointers // but the same raw_idx; masked to 0 for null lanes. // mPrefixSum is a packed uint64_t: field w lives at bits [9*(w-1)+:9] - // (9-bit fields, w=0 → prefix = 0 by definition). + // (9-bit fields, w=0 -> prefix = 0 by definition). // - // Step 5 — maskWords: gather from valueMask().words() base. + // Step 5 -- maskWords: gather from valueMask().words() base. // words()[wi] for leaf[leaf_id] = mask_word_base[leaf_id*kStride + wi]. // The per-lane wi is added to raw_idx to form the mask gather index. using U32T = util::Simd; using U64T = util::Simd; - using U64Traits = util::simd_traits; - // Step 1 — d_vec: per-lane dir (0..26) via base-32 multiply (§8d). + // Direction-extraction constants (base-32 multiply trick, Sec.8d). + static constexpr uint16_t kSwarCarryMask = 0x6318u; // carry bits [3:4],[8:9],[13:14] + static constexpr uint16_t kDirMul = 1129u; // base-32 multiplier: 1*32^2 + 3*32 + 9 + static constexpr uint16_t kDirMask = 31u; // 5-bit digit mask + + // Step 1 -- d_vec: per-lane dir (0..26) via base-32 multiply (Sec.8d). // Stay in uint16_t throughout: bits [10:14] of (v * 1129) are entirely // within the lower 16 bits, so the modular uint16_t product gives the // same result as the full-width product for all valid inputs. 
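+        // Worked example (illustrative): a hi-x crossing with y,z in-leaf has
+        // per-axis codes (cx,cy,cz) = (2,1,1), so v = 1 + 1*32 + 2*1024 = 2081 and
+        // ((2081 * 1129) >> 10) & 31 = 2294 & 31 = 22 = dir(+1,0,0).
+        // The base-32 digits of v*1129 (9*cz, 3*cz + 9*cy, cz + 3*cy + 9*cx) never
+        // exceed 26, so no carry reaches the dir digit at bits [10:14].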
- const auto d_u16 = ((packed_sum & VoxelOffsetT(0x6318u)) - >> VoxelOffsetScalarT(3)) - * VoxelOffsetT(1129u) >> VoxelOffsetScalarT(10) - & VoxelOffsetT(31u); + const auto d_u16 = (((packed_sum & VoxelOffsetT(kSwarCarryMask)) + >> VoxelOffsetScalarT(3)) + * VoxelOffsetT(kDirMul) + >> VoxelOffsetScalarT(10)) + & VoxelOffsetT(kDirMask); const auto d_i32 = util::simd_cast(d_u16); - // Step 2 — leaf IDs + // Step 2 -- leaf IDs const auto leaf_id_vec = util::gather(mNeighborLeafIDs, d_i32); // Simd const auto valid_u32 = (leaf_id_vec != U32T(kNullLeafID)); // SimdMask - // Step 3 — stride-scaled gather indices (null lanes → 0) + // Step 3 -- stride-scaled gather indices (null lanes -> 0) static constexpr uint32_t kStride = sizeof(LeafT) / sizeof(uint64_t); const auto raw_idx = util::simd_cast( util::where(valid_u32, leaf_id_vec * U32T(kStride), U32T(0))); - // Step 4a — offsets (mOffset) + // Step 4a -- offsets (mOffset) const uint64_t* offset_base = reinterpret_cast( &mGrid.tree().getFirstLeaf()[0].data()->mOffset); const U64T offsets = util::where(valid_u32, util::gather(offset_base, raw_idx), U64T(0)); - // Step 4b — prefixSums (mPrefixSum packed uint64_t, shift-extract field w) + // Step 4b -- prefixSums (mPrefixSum packed uint64_t, shift-extract field w) const uint64_t* prefix_base = reinterpret_cast( &mGrid.tree().getFirstLeaf()[0].data()->mPrefixSum); const auto prefix_raw = util::gather(prefix_base, raw_idx); - const auto w_u64 = util::simd_cast(w_vec); + const auto w_u64 = util::simd_cast(wordIndex); const auto nonzero_w = (w_u64 != U64T(0)); const auto shift = util::where(nonzero_w, (w_u64 - U64T(1)) * U64T(9), U64T(0)); const U64T prefixSums = util::where(valid_u32, util::where(nonzero_w, (prefix_raw >> shift) & U64T(511u), U64T(0)), U64T(0)); - // Step 5 — maskWords (valueMask().words()[w]) + // Step 5 -- maskWords (valueMask().words()[w]) // mask_word_base[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w] // because the mask field is at a fixed offsetof within every LeafT. 
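+        // Example (illustrative): a lane with leaf_id = 2 and w = 5 reads
+        // mask_word_base[2*kStride + 5], which aliases leaf[2].valueMask().words()[5].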
const uint64_t* mask_word_base = mGrid.tree().getFirstLeaf()[0].valueMask().words(); - const auto w_i32 = util::simd_cast(util::simd_cast(w_vec)); + const auto w_i32 = util::simd_cast(util::simd_cast(wordIndex)); const auto mask_idx = raw_idx + w_i32; const U64T maskWords = util::where(valid_u32, util::gather(mask_word_base, mask_idx), U64T(0)); @@ -429,6 +434,7 @@ class BatchAccessor // Debug cross-check: validate SIMD-path values against scalar ref // ------------------------------------------------------------------- #ifndef NDEBUG + using U64Traits = util::simd_traits; for (int i = 0; i < LaneWidth; ++i) { if (!Pred_traits::get(leafMask, i)) continue; @@ -448,7 +454,7 @@ class BatchAccessor // SIMD-path values for this lane const uint32_t ps_i = static_cast(VO_traits::get(packed_sum, i)); const int d_simd = int((((ps_i & 0x6318u) >> 3) * 1129u >> 10) & 31u); - const int wi = int(VO_traits::get(w_vec, i)); + const int wi = int(VO_traits::get(wordIndex, i)); assert(d_simd == d_ref && "cachedGetValue SIMD: dir mismatch"); assert(wi == nx_w && "cachedGetValue SIMD: w (dest_x) mismatch"); @@ -477,7 +483,7 @@ class BatchAccessor } // ----------------------------------------------------------------------- - // Legacy scalar path — authoritative until SIMD path is wired in + // Legacy scalar path -- authoritative until SIMD path is wired in // ----------------------------------------------------------------------- for (int i = 0; i < LaneWidth; ++i) { if (!Pred_traits::get(leafMask, i)) continue; From 87c3e1b3ab248f70caf528eeb02f662ec73d66c3 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Fri, 17 Apr 2026 07:39:26 -0500 Subject: [PATCH 22/60] BatchAccessor/Simd: scalar overloads, LeafIDVecT/LeafDataVecT class aliases Simd.h: - Add scalar simd_cast(SrcT) overload (degrades to static_cast). - Add scalar 2-arg where(bool, T&) masked-assignment proxy matching the SIMD WhereExpression form. - Add scalar gather(const T*, int32_t) and gather_if(T&, bool, ...) to complete the scalar overload set alongside the existing where(bool,T,T). BatchAccessor.h: - Remove LeafIDT template parameter (was reserved/unused; now derived). - Add private class-scope LeafIDVecT and LeafDataVecT using conditional_t: plain uint32_t/uint64_t when LaneWidth==1, Simd otherwise. This upholds the convention that scalar instantiations use underlying types directly rather than Simd wrappers. - Replace local U32T/U64T aliases in cachedGetValue with class-scope names. stencil_gather_cpu.cpp: - Drop the now-removed LeafIDT argument from the BAccT instantiation. 
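Illustrative usage (example only, not taken from the diffs below): with the
scalar overloads in place, generic accessor code such as

    util::where(leafMask, packed_lc) = expanded;              // bool mask, scalar target
    util::gather_if(leaf_id, leafMask, mNeighborLeafIDs, d);  // scalar merge-masked gather

compiles unchanged for LaneWidth == 1, with leafMask a plain bool and the
targets plain scalars.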
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../stencil_gather_cpu.cpp | 1 - .../BatchAccessor.h | 58 ++++++++++--------- simd_test/Simd.h | 25 ++++++++ 3 files changed, 56 insertions(+), 28 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index 2488e13c3b..a359e069fc 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -69,7 +69,6 @@ using LaneMask = nanovdb::util::SimdMask; using BAccT = nanovdb::BatchAccessor, // ValueT nanovdb::util::Simd, // VoxelOffsetT - nanovdb::util::Simd, // LeafIDT (unused by BatchAccessor internals) LaneMask>; // PredicateT // Direction bit encoding shared across all stencil types: diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h index 56b888aacf..4362814522 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h @@ -21,10 +21,8 @@ For NanoGrid: uint64_t or Simd VoxelOffsetT Compact (9-bit) voxel offset within a leaf. Scalar path: uint16_t. SIMD path: Simd. - LeafIDT Leaf index type -- reserved for future use by the caller loop. - Scalar: uint32_t. SIMD: Simd. PredicateT Per-lane active predicate (the leafMask). - Scalar: bool. SIMD: SimdMask or similar. + Scalar: bool. SIMD: SimdMask. Usage ----- @@ -55,7 +53,6 @@ namespace nanovdb { template class BatchAccessor { @@ -72,8 +69,16 @@ class BatchAccessor static constexpr int LaneWidth = VO_traits::width; + // SIMD bundle types for the ingredient gather. + // Degrade to plain scalar types when LaneWidth == 1. + using LeafIDVecT = std::conditional_t>; + using LeafDataVecT = std::conditional_t>; + static_assert(VO_traits::width == Pred_traits::width, "BatchAccessor: VoxelOffsetT and PredicateT must have the same lane width"); + static_assert(std::is_same_v || + std::is_same_v>, + "BatchAccessor: PredicateT must be bool (scalar) or SimdMask (SIMD)"); static_assert(Val_traits::width == 1 || Val_traits::width == VO_traits::width, "BatchAccessor: ValueT lane width must be 1 (scalar) or match VoxelOffsetT"); @@ -376,9 +381,6 @@ class BatchAccessor // Step 5 -- maskWords: gather from valueMask().words() base. // words()[wi] for leaf[leaf_id] = mask_word_base[leaf_id*kStride + wi]. // The per-lane wi is added to raw_idx to form the mask gather index. - using U32T = util::Simd; - using U64T = util::Simd; - // Direction-extraction constants (base-32 multiply trick, Sec.8d). static constexpr uint16_t kSwarCarryMask = 0x6318u; // carry bits [3:4],[8:9],[13:14] static constexpr uint16_t kDirMul = 1129u; // base-32 multiplier: 1*32^2 + 3*32 + 9 @@ -395,31 +397,33 @@ class BatchAccessor & VoxelOffsetT(kDirMask); const auto d_i32 = util::simd_cast(d_u16); - // Step 2 -- leaf IDs - const auto leaf_id_vec = util::gather(mNeighborLeafIDs, d_i32); // Simd - const auto valid_u32 = (leaf_id_vec != U32T(kNullLeafID)); // SimdMask + // Step 2 -- leaf IDs: gather only for active lanes; inactive lanes keep kNullLeafID. + // valid_u32 is then the combined effective mask (leafMask AND neighbor exists). 
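+        // Example (illustrative): with leafMask = {1,1,0,...} and d_i32 = {22,13,*,...},
+        // leaf_id_vec becomes {mNeighborLeafIDs[22], mNeighborLeafIDs[13], kNullLeafID, ...};
+        // the masked-off lane keeps kNullLeafID and drops out of valid_u32.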
+ LeafIDVecT leaf_id_vec(kNullLeafID); + util::gather_if(leaf_id_vec, leafMask, mNeighborLeafIDs, d_i32); + const auto valid_u32 = (leaf_id_vec != LeafIDVecT(kNullLeafID)); // SimdMask // Step 3 -- stride-scaled gather indices (null lanes -> 0) static constexpr uint32_t kStride = sizeof(LeafT) / sizeof(uint64_t); const auto raw_idx = util::simd_cast( - util::where(valid_u32, leaf_id_vec * U32T(kStride), U32T(0))); + util::where(valid_u32, leaf_id_vec * LeafIDVecT(kStride), LeafIDVecT(0))); // Step 4a -- offsets (mOffset) const uint64_t* offset_base = reinterpret_cast( &mGrid.tree().getFirstLeaf()[0].data()->mOffset); - const U64T offsets = util::where(valid_u32, - util::gather(offset_base, raw_idx), U64T(0)); + const LeafDataVecT offsets = util::where(valid_u32, + util::gather(offset_base, raw_idx), LeafDataVecT(0)); // Step 4b -- prefixSums (mPrefixSum packed uint64_t, shift-extract field w) const uint64_t* prefix_base = reinterpret_cast( &mGrid.tree().getFirstLeaf()[0].data()->mPrefixSum); const auto prefix_raw = util::gather(prefix_base, raw_idx); const auto w_u64 = util::simd_cast(wordIndex); - const auto nonzero_w = (w_u64 != U64T(0)); - const auto shift = util::where(nonzero_w, (w_u64 - U64T(1)) * U64T(9), U64T(0)); - const U64T prefixSums = util::where(valid_u32, - util::where(nonzero_w, (prefix_raw >> shift) & U64T(511u), U64T(0)), - U64T(0)); + const auto nonzero_w = (w_u64 != LeafDataVecT(0)); + const auto shift = util::where(nonzero_w, (w_u64 - LeafDataVecT(1)) * LeafDataVecT(9), LeafDataVecT(0)); + const LeafDataVecT prefixSums = util::where(valid_u32, + util::where(nonzero_w, (prefix_raw >> shift) & LeafDataVecT(511u), LeafDataVecT(0)), + LeafDataVecT(0)); // Step 5 -- maskWords (valueMask().words()[w]) // mask_word_base[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w] @@ -428,13 +432,13 @@ class BatchAccessor mGrid.tree().getFirstLeaf()[0].valueMask().words(); const auto w_i32 = util::simd_cast(util::simd_cast(wordIndex)); const auto mask_idx = raw_idx + w_i32; - const U64T maskWords = util::where(valid_u32, - util::gather(mask_word_base, mask_idx), U64T(0)); + const LeafDataVecT maskWords = util::where(valid_u32, + util::gather(mask_word_base, mask_idx), LeafDataVecT(0)); // ------------------------------------------------------------------- // Debug cross-check: validate SIMD-path values against scalar ref // ------------------------------------------------------------------- #ifndef NDEBUG - using U64Traits = util::simd_traits; + using LeafDataVecTraits = util::simd_traits; for (int i = 0; i < LaneWidth; ++i) { if (!Pred_traits::get(leafMask, i)) continue; @@ -463,18 +467,18 @@ class BatchAccessor const uint64_t pfx_ref = (uint32_t(nx_w) > 0u) ? 
(ref->data()->mPrefixSum >> (9u * (uint32_t(nx_w) - 1u))) & 511u : uint64_t(0); - assert(U64Traits::get(offsets, i) == ref->data()->mOffset + assert(LeafDataVecTraits::get(offsets, i) == ref->data()->mOffset && "cachedGetValue SIMD: mOffset mismatch"); - assert(U64Traits::get(prefixSums, i) == pfx_ref + assert(LeafDataVecTraits::get(prefixSums, i) == pfx_ref && "cachedGetValue SIMD: mPrefixSum mismatch"); - assert(U64Traits::get(maskWords, i) == ref->valueMask().words()[nx_w] + assert(LeafDataVecTraits::get(maskWords, i) == ref->valueMask().words()[nx_w] && "cachedGetValue SIMD: maskWord mismatch"); } else { - assert(U64Traits::get(offsets, i) == uint64_t(0) + assert(LeafDataVecTraits::get(offsets, i) == uint64_t(0) && "cachedGetValue SIMD: null leaf offsets should be 0"); - assert(U64Traits::get(prefixSums, i) == uint64_t(0) + assert(LeafDataVecTraits::get(prefixSums, i) == uint64_t(0) && "cachedGetValue SIMD: null leaf prefixSums should be 0"); - assert(U64Traits::get(maskWords, i) == uint64_t(0) + assert(LeafDataVecTraits::get(maskWords, i) == uint64_t(0) && "cachedGetValue SIMD: null leaf maskWords should be 0"); } } diff --git a/simd_test/Simd.h b/simd_test/Simd.h index 59c3c8a0c0..69ca628165 100644 --- a/simd_test/Simd.h +++ b/simd_test/Simd.h @@ -366,6 +366,7 @@ NANOVDB_SIMD_HOSTDEV void gather_if(Simd& dst, SimdMask mask, // Both backends: the array backend uses a lane loop; the stdx backend uses the // generator constructor, which the compiler lowers to a vpmovsxbw / vpmovzxwd // sequence or similar sign/zero-extend instruction depending on the types. +// Scalar overload: degrades to static_cast for plain scalar types. // --------------------------------------------------------------------------- template NANOVDB_SIMD_HOSTDEV Simd simd_cast(Simd src) { @@ -377,6 +378,8 @@ NANOVDB_SIMD_HOSTDEV Simd simd_cast(Simd src) { return r; #endif } +template +NANOVDB_SIMD_HOSTDEV DstT simd_cast(SrcT src) { return static_cast(src); } // --------------------------------------------------------------------------- // simd_traits — generic per-lane access for scalar and Simd types. @@ -461,5 +464,27 @@ template NANOVDB_SIMD_HOSTDEV T where(bool m, T a, T b) { return m ? template NANOVDB_SIMD_HOSTDEV T reduce(T v, BinaryOp) { return v; } +// 2-argument where: scalar masked-assignment proxy matching the Simd form. +// where(mask, target) = value writes value into target only if mask is true. +template +struct ScalarWhereProxy { + bool mask; T& target; + NANOVDB_SIMD_HOSTDEV void operator=(const T& v) { if (mask) target = v; } +}; +template +NANOVDB_SIMD_HOSTDEV ScalarWhereProxy where(bool mask, T& target) { + return {mask, target}; +} + +// Unmasked scalar gather: result = ptr[idx]. +template +NANOVDB_SIMD_HOSTDEV T gather(const T* __restrict__ ptr, int32_t idx) { return ptr[idx]; } + +// Merge-masked scalar gather: dst = ptr[idx] only if mask, else dst unchanged. +template +NANOVDB_SIMD_HOSTDEV void gather_if(T& dst, bool mask, const T* __restrict__ ptr, int32_t idx) { + if (mask) dst = ptr[idx]; +} + } // namespace util } // namespace nanovdb From 14c9db7359155d3a080429f6fc58440dff8e1609 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Fri, 17 Apr 2026 16:37:28 -0500 Subject: [PATCH 23/60] BatchAccessor: complete SIMD cachedGetValue pipeline (Steps 1-8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cachedGetValue is now fully vectorised end-to-end for ValueOnIndex grids. 
The scalar leaf->getValue(offset) loop is removed; result is filled via a 2-arg where(isActive, result) = ... directly on the output argument so that leafMask-clear and inactive-voxel lanes are never touched. Key design decisions recorded in BatchAccessor.md §8e: - tapLeafOffset_i64 widened to int64_t before *= kStride to avoid uint32_t overflow (kNullLeafID = 0xFFFFFFFF causes wild gather indices in uint32_t). simd_cast_if(dst, valid_u32, src) writes 0 for invalid lanes, keeping gather indices non-negative for vpgatherqq (signed int64_t). - gather_if gains a MaskElemT template parameter to support heterogeneous masks: valid_u32 (SimdMask) applied to uint64_t data fields. - Activity check: ValueOnIndex::getValue returns 0 for inactive voxels. Detected as isActive = (maskWords & (1< Signed-off-by: Efty Sifakis --- .../BatchAccessor.h | 147 ++++++++--------- .../BatchAccessor.md | 154 ++++++++++++++---- simd_test/Simd.h | 105 +++++++++--- 3 files changed, 276 insertions(+), 130 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h index 4362814522..3ad9c29efa 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h @@ -148,6 +148,9 @@ class BatchAccessor , mCenterLeafID(firstLeafID) , mCenterOrigin(grid.tree().getFirstLeaf()[firstLeafID].origin()) , mProbedMask(1u << dir(0, 0, 0)) + , mOffsetBase (reinterpret_cast(&grid.tree().getFirstLeaf()[0].data()->mOffset)) + , mPrefixBase (reinterpret_cast(&grid.tree().getFirstLeaf()[0].data()->mPrefixSum)) + , mMaskWordBase(grid.tree().getFirstLeaf()[0].valueMask().words()) { for (auto& id : mNeighborLeafIDs) id = kNullLeafID; mNeighborLeafIDs[dir(0, 0, 0)] = mCenterLeafID; @@ -357,7 +360,7 @@ class BatchAccessor const auto packed_sum = packed_lc + VoxelOffsetT(packed_tap); // dest_x per lane: bits [10:12] of packed_sum -> uint64_t mask word index (0..7) - const auto wordIndex = (packed_sum >> VoxelOffsetScalarT(10)) & VoxelOffsetT(7u); + const auto wordIdx_u16 = (packed_sum >> VoxelOffsetScalarT(10)) & VoxelOffsetT(7u); // SIMD gather of mOffset, mPrefixSum[w], and maskWords[w] per lane. // @@ -366,21 +369,21 @@ class BatchAccessor // bits lie entirely below bit 16, so the modular uint16_t product gives // the same answer as the full-width product for all valid + sentinel inputs. // - // Step 2 -- leaf_id_vec: gather mNeighborLeafIDs[d] for all lanes at once. + // Step 2 -- tapLeafID_u32: gather mNeighborLeafIDs[d] for all lanes at once. // - // Step 3 -- raw_idx: leaf_id * (sizeof(LeafT)/sizeof(uint64_t)). + // Step 3 -- tapLeafOffset_i64: leaf_id * (sizeof(LeafT)/sizeof(uint64_t)). // This is the per-lane uint64_t-stride index into the flat leaf array, // viewed as a uint64_t[] through the base pointer of the target field. // Invalid (kNullLeafID) lanes are clamped to index 0 (safe; masked out). // // Step 4 -- offsets / prefixSums: two gathers with different base pointers - // but the same raw_idx; masked to 0 for null lanes. + // but the same tapLeafOffset_i64; masked to 0 for null lanes. // mPrefixSum is a packed uint64_t: field w lives at bits [9*(w-1)+:9] // (9-bit fields, w=0 -> prefix = 0 by definition). // // Step 5 -- maskWords: gather from valueMask().words() base. // words()[wi] for leaf[leaf_id] = mask_word_base[leaf_id*kStride + wi]. - // The per-lane wi is added to raw_idx to form the mask gather index. 
+ // The per-lane wi is added to tapLeafOffset_i64 to form the mask gather index. // Direction-extraction constants (base-32 multiply trick, Sec.8d). static constexpr uint16_t kSwarCarryMask = 0x6318u; // carry bits [3:4],[8:9],[13:14] static constexpr uint16_t kDirMul = 1129u; // base-32 multiplier: 1*32^2 + 3*32 + 9 @@ -399,48 +402,65 @@ class BatchAccessor // Step 2 -- leaf IDs: gather only for active lanes; inactive lanes keep kNullLeafID. // valid_u32 is then the combined effective mask (leafMask AND neighbor exists). - LeafIDVecT leaf_id_vec(kNullLeafID); - util::gather_if(leaf_id_vec, leafMask, mNeighborLeafIDs, d_i32); - const auto valid_u32 = (leaf_id_vec != LeafIDVecT(kNullLeafID)); // SimdMask - - // Step 3 -- stride-scaled gather indices (null lanes -> 0) - static constexpr uint32_t kStride = sizeof(LeafT) / sizeof(uint64_t); - const auto raw_idx = util::simd_cast( - util::where(valid_u32, leaf_id_vec * LeafIDVecT(kStride), LeafIDVecT(0))); + LeafIDVecT tapLeafID_u32(kNullLeafID); + util::gather_if(tapLeafID_u32, leafMask, mNeighborLeafIDs, d_i32); + const auto valid_u32 = (tapLeafID_u32 != LeafIDVecT(kNullLeafID)); // SimdMask + + // Step 3 -- stride-scaled gather indices (widened to int64_t, invalid lanes -> 0) + // kStride is sizeof(LeafT)/sizeof(uint64_t); the static_assert makes the + // divisibility assumption explicit (NanoVDB leaves are always 8-byte aligned). + static_assert(sizeof(LeafT) % sizeof(uint64_t) == 0, + "LeafT must be uint64_t-aligned for packed gather indexing"); + static constexpr int64_t kStride = int64_t(sizeof(LeafT) / sizeof(uint64_t)); + using Int64VecT = std::conditional_t>; + Int64VecT tapLeafOffset_i64(0); + util::simd_cast_if(tapLeafOffset_i64, valid_u32, tapLeafID_u32); + tapLeafOffset_i64 = tapLeafOffset_i64 * Int64VecT(kStride); // Step 4a -- offsets (mOffset) - const uint64_t* offset_base = reinterpret_cast( - &mGrid.tree().getFirstLeaf()[0].data()->mOffset); - const LeafDataVecT offsets = util::where(valid_u32, - util::gather(offset_base, raw_idx), LeafDataVecT(0)); + LeafDataVecT offsets(0); + util::gather_if(offsets, valid_u32, mOffsetBase, tapLeafOffset_i64); // Step 4b -- prefixSums (mPrefixSum packed uint64_t, shift-extract field w) - const uint64_t* prefix_base = reinterpret_cast( - &mGrid.tree().getFirstLeaf()[0].data()->mPrefixSum); - const auto prefix_raw = util::gather(prefix_base, raw_idx); - const auto w_u64 = util::simd_cast(wordIndex); - const auto nonzero_w = (w_u64 != LeafDataVecT(0)); - const auto shift = util::where(nonzero_w, (w_u64 - LeafDataVecT(1)) * LeafDataVecT(9), LeafDataVecT(0)); - const LeafDataVecT prefixSums = util::where(valid_u32, - util::where(nonzero_w, (prefix_raw >> shift) & LeafDataVecT(511u), LeafDataVecT(0)), - LeafDataVecT(0)); + // Invalid lanes have prefixSums=0 after gather_if; (0>>shift)&511=0 for any shift, + // so the outer valid_u32 guard from before is not needed. 
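+        // Example (illustrative): wordIdx = 3 -> shift = (3-1)*9 = 18, so
+        // (mPrefixSum >> 18) & 511 is the active-voxel count of x-slices 0..2;
+        // wordIdx = 0 takes the where() fallback and yields the empty prefix 0.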
+ LeafDataVecT prefixSums(0); + util::gather_if(prefixSums, valid_u32, mPrefixBase, tapLeafOffset_i64); + const auto wordIdx_u64 = util::simd_cast(wordIdx_u16); + const auto nonzero_w = (wordIdx_u64 != LeafDataVecT(0)); + const auto shift = util::where(nonzero_w, (wordIdx_u64 - LeafDataVecT(1)) * LeafDataVecT(9), LeafDataVecT(0)); + prefixSums = util::where(nonzero_w, (prefixSums >> shift) & LeafDataVecT(511u), LeafDataVecT(0)); // Step 5 -- maskWords (valueMask().words()[w]) - // mask_word_base[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w] + // mMaskWordBase[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w] // because the mask field is at a fixed offsetof within every LeafT. - const uint64_t* mask_word_base = - mGrid.tree().getFirstLeaf()[0].valueMask().words(); - const auto w_i32 = util::simd_cast(util::simd_cast(wordIndex)); - const auto mask_idx = raw_idx + w_i32; - const LeafDataVecT maskWords = util::where(valid_u32, - util::gather(mask_word_base, mask_idx), LeafDataVecT(0)); + const auto wordIdx_i64 = util::simd_cast(wordIdx_u16); + const auto mask_idx = tapLeafOffset_i64 + wordIdx_i64; + LeafDataVecT maskWords(0); + util::gather_if(maskWords, valid_u32, mMaskWordBase, mask_idx); + // Step 6 -- dest_yz: 6-bit intra-word bit position (ny_w*8 + nz_w). + // packed_sum bits [5:7] = dest_y, bits [0:2] = dest_z (both wrapped mod 8). + const auto dest_yz_u16 = ((packed_sum >> VoxelOffsetScalarT(2)) & VoxelOffsetT(0x38u)) + | (packed_sum & VoxelOffsetT(0x07u)); + const auto dest_yz_u64 = util::simd_cast(dest_yz_u16); + + // Step 7 -- activity check + truncated maskWord. + // If voxel dest_yz is inactive, getValue returns 0 (not the formula below). + // Null-leaf lanes already have maskWords=0, so they are implicitly inactive. + const auto voxelBit = LeafDataVecT(1) << dest_yz_u64; + const auto isActive = (maskWords & voxelBit) != LeafDataVecT(0); + const auto truncated = maskWords & (voxelBit - LeafDataVecT(1)); + + // Step 8 -- fill result in-place; leafMask-clear lanes are untouched. + util::where(isActive, result) = offsets + prefixSums + util::popcount(truncated); + // ------------------------------------------------------------------- // Debug cross-check: validate SIMD-path values against scalar ref // ------------------------------------------------------------------- #ifndef NDEBUG using LeafDataVecTraits = util::simd_traits; for (int i = 0; i < LaneWidth; ++i) { - if (!Pred_traits::get(leafMask, i)) continue; + if (!Pred_traits::get(leafMask, i)) continue; // only check lanes caller asked about // Scalar reference: same arithmetic as the legacy loop below const auto vo_i = static_cast(VO_traits::get(vo, i)); @@ -449,8 +469,10 @@ class BatchAccessor const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0; const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0; const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0; - const int d_ref = dir(dx, dy, dz); - const int nx_w = nx - dx * 8; // = dest_x = word index w + const int d_ref = dir(dx, dy, dz); + const int nx_w = nx - dx * 8; + const int ny_w = ny - dy * 8; + const int nz_w = nz - dz * 8; const uint32_t ref_id = mNeighborLeafIDs[d_ref]; const LeafT* ref = (ref_id != kNullLeafID) ? 
&mGrid.tree().getFirstLeaf()[ref_id] : nullptr; @@ -458,7 +480,7 @@ class BatchAccessor // SIMD-path values for this lane const uint32_t ps_i = static_cast(VO_traits::get(packed_sum, i)); const int d_simd = int((((ps_i & 0x6318u) >> 3) * 1129u >> 10) & 31u); - const int wi = int(VO_traits::get(wordIndex, i)); + const int wi = int(VO_traits::get(wordIdx_u16, i)); assert(d_simd == d_ref && "cachedGetValue SIMD: dir mismatch"); assert(wi == nx_w && "cachedGetValue SIMD: w (dest_x) mismatch"); @@ -467,12 +489,15 @@ class BatchAccessor const uint64_t pfx_ref = (uint32_t(nx_w) > 0u) ? (ref->data()->mPrefixSum >> (9u * (uint32_t(nx_w) - 1u))) & 511u : uint64_t(0); + const uint32_t ref_offset = uint32_t(nx_w)*64u + uint32_t(ny_w)*8u + uint32_t(nz_w); assert(LeafDataVecTraits::get(offsets, i) == ref->data()->mOffset && "cachedGetValue SIMD: mOffset mismatch"); assert(LeafDataVecTraits::get(prefixSums, i) == pfx_ref && "cachedGetValue SIMD: mPrefixSum mismatch"); assert(LeafDataVecTraits::get(maskWords, i) == ref->valueMask().words()[nx_w] && "cachedGetValue SIMD: maskWord mismatch"); + assert(Val_traits::get(result, i) == static_cast(ref->getValue(ref_offset)) + && "cachedGetValue SIMD: final result mismatch"); } else { assert(LeafDataVecTraits::get(offsets, i) == uint64_t(0) && "cachedGetValue SIMD: null leaf offsets should be 0"); @@ -480,42 +505,11 @@ class BatchAccessor && "cachedGetValue SIMD: null leaf prefixSums should be 0"); assert(LeafDataVecTraits::get(maskWords, i) == uint64_t(0) && "cachedGetValue SIMD: null leaf maskWords should be 0"); + assert(Val_traits::get(result, i) == uint64_t(0) + && "cachedGetValue SIMD: null leaf result should be 0"); } } #endif - (void)offsets; (void)prefixSums; (void)maskWords; - } - - // ----------------------------------------------------------------------- - // Legacy scalar path -- authoritative until SIMD path is wired in - // ----------------------------------------------------------------------- - for (int i = 0; i < LaneWidth; ++i) { - if (!Pred_traits::get(leafMask, i)) continue; - const auto vo_i = static_cast(VO_traits::get(vo, i)); - const int lx = (vo_i >> 6) & 7; - const int ly = (vo_i >> 3) & 7; - const int lz = vo_i & 7; - const int nx = lx + di, ny = ly + dj, nz = lz + dk; - const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0; - const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0; - const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0; - // Wrapped local coordinate within the neighbor leaf. - const int nx_w = nx - dx * 8; - const int ny_w = ny - dy * 8; - const int nz_w = nz - dz * 8; - // NanoVDB leaf layout: offset = lx*64 + ly*8 + lz. - const uint32_t offset = uint32_t(nx_w) * 64u - + uint32_t(ny_w) * 8u - + uint32_t(nz_w); - const int d = dir(dx, dy, dz); - assert((mProbedMask & (1u << d)) && "cachedGetValue: direction not prefetched"); - const uint32_t leaf_id = mNeighborLeafIDs[d]; - const LeafT* leaf = (leaf_id != kNullLeafID) - ? &mGrid.tree().getFirstLeaf()[leaf_id] : nullptr; - const ScalarValueT val = leaf - ? 
static_cast(leaf->getValue(offset)) - : ScalarValueT(0); - Val_traits::set(result, i, val); } } @@ -530,11 +524,14 @@ class BatchAccessor return mCenterOrigin + Coord(dx * 8, dy * 8, dz * 8); } - const GridT& mGrid; - uint32_t mCenterLeafID; - Coord mCenterOrigin; - uint32_t mProbedMask; - uint32_t mNeighborLeafIDs[27]; // kNullLeafID when not probed or outside narrow band + const GridT& mGrid; + uint32_t mCenterLeafID; + Coord mCenterOrigin; + uint32_t mProbedMask; + uint32_t mNeighborLeafIDs[27]; // kNullLeafID when not probed or outside narrow band + const uint64_t* const mOffsetBase; // &getFirstLeaf()[0].data()->mOffset + const uint64_t* const mPrefixBase; // &getFirstLeaf()[0].data()->mPrefixSum + const uint64_t* const mMaskWordBase; // getFirstLeaf()[0].valueMask().words() }; } // namespace nanovdb diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md index 6b3e98dd7d..1bcc17dfbb 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md @@ -38,7 +38,6 @@ subsequent batches in the same center leaf. template class BatchAccessor; ``` @@ -46,10 +45,9 @@ class BatchAccessor; | Parameter | Scalar default | SIMD example | Role | |-----------|---------------|--------------|------| | `BuildT` | — | — | NanoVDB build type; determines `LeafT`, `TreeT` | -| `ValueT` | `float` | `Simd` | Return type of `cachedGetValue` | +| `ValueT` | `float` | `Simd` | Result type of `cachedGetValue` | | `VoxelOffsetT` | `uint16_t` | `Simd` | Compact 9-bit voxel offset within a leaf | -| `LeafIDT` | `uint32_t` | `Simd` | Per-lane leaf ID (reserved for caller loop) | -| `PredicateT` | `bool` | `SimdMask` | Per-lane active predicate | +| `PredicateT` | `bool` | `SimdMask` | Per-lane active predicate | For `NanoGrid`, use `ValueT = uint64_t` (scalar) or `ValueT = Simd` (SIMD). @@ -64,14 +62,18 @@ which works for both scalar and vector types via specialisation. ## 3. Persistent State -Four members persist across batches within one center leaf: +Members that persist across batches within one center leaf: ```cpp -const GridT& mGrid; // for probeLeaf calls via mGrid.tree() -uint32_t mCenterLeafID; // index of current center leaf -Coord mCenterOrigin; // world-space origin of current center leaf -uint32_t mProbedMask = (1u << 13); // bit 13 (center) pre-set at construction -uint32_t mNeighborLeafIDs[27]; // kNullLeafID when outside narrow band or unprobed +const GridT& mGrid; // for probeLeaf calls via mGrid.tree() +uint32_t mCenterLeafID; // index of current center leaf +Coord mCenterOrigin; // world-space origin of current center leaf +uint32_t mProbedMask; // bit 13 (center) pre-set at construction +uint32_t mNeighborLeafIDs[27]; // kNullLeafID when outside narrow band or unprobed + +const uint64_t* const mOffsetBase; // &getFirstLeaf()[0].data()->mOffset +const uint64_t* const mPrefixBase; // &getFirstLeaf()[0].data()->mPrefixSum +const uint64_t* const mMaskWordBase; // getFirstLeaf()[0].valueMask().words() ``` **Direction encoding** (`dir` is a `static constexpr` member): @@ -90,12 +92,17 @@ static constexpr uint32_t kNullLeafID = ~uint32_t(0); ``` **Why leaf IDs, not pointers:** `cachedGetValue` fetches `mOffset`, `mPrefixSum`, and -`valueMask().words()[w]` for all active lanes via SIMD gathers (§8d). 
The gather index -is `leaf_id × (sizeof(LeafT)/sizeof(uint64_t))`, computed once per call as a -`Simd` multiply. Storing IDs enables a single flat-base gather over the -contiguous leaf array; storing pointers would require per-lane pointer arithmetic that -doesn't map to `vgatherdpd` / `vpgatherqq`. The `kNullLeafID` sentinel cleanly -replaces `nullptr` and is masked out in the gather via `where`. +`valueMask().words()[w]` for all active lanes via SIMD gathers (§8d–§8e). The gather index +is `leaf_id × (sizeof(LeafT)/sizeof(uint64_t))`, computed as a `Simd` (see §8e). +Storing IDs enables a single flat-base gather over the contiguous leaf array; storing +pointers would require per-lane pointer arithmetic that doesn't map to `vpgatherqq`. +The `kNullLeafID` sentinel is masked out before any gather via `valid_u32` (§8e). + +**Class-level base pointers:** `mOffsetBase`, `mPrefixBase`, and `mMaskWordBase` are +`const` pointers computed once in the constructor from `getFirstLeaf()[0]`. They are +invariant over the lifetime of the accessor (the leaf array is fixed after grid construction) +and are shared across all 18 `cachedGetValue` instantiations in a WENO5 gather, avoiding +the equivalent recomputation in every call. **Cache advance:** when `none_of(leafMask)` fires in the outer loop: @@ -209,13 +216,20 @@ template void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const; ``` -- Fills **only the `leafMask` lanes** of `result` (by reference). -- Inactive lanes are not touched — values from a previous iteration are preserved. -- This is the correct API for the straddle-aware outer loop: the caller declares - all stencil result variables before the `while` loop, fills them progressively - across iterations, and calls the kernel once after `activeMask` is empty. +- Fills **only the `leafMask` lanes** of `result` (by reference) via a 2-arg `where` + directly on `result` — no intermediate copy, no write-back. +- `leafMask`-clear lanes are **not touched**: values from a previous iteration are + preserved exactly as the caller left them. +- Additionally, lanes for which the tap voxel is inactive (outside the narrow band + within an existing neighbor leaf) are also not written; `result` retains whatever + default the caller initialised it to (typically 0 for a zero-initialized stencil + buffer, matching `ValueOnIndex::getValue`'s return of 0 for inactive voxels). +- This contract suits the straddle-aware outer loop: the caller declares stencil + result variables (zero-initialised) before the `while` loop, fills them + progressively across iterations, and calls the kernel once after `activeMask` is empty. - Requires the corresponding direction to be in `mProbedMask` (asserted in debug). -- `nullptr` leaf (outside narrow band) writes `ScalarValueT(0)`. +- `kNullLeafID` leaf (neighbor outside the narrow band entirely) also leaves `result` + untouched, for the same reason: `maskWords = 0` → `isActive = false`. ### 6e. Deferred @@ -290,12 +304,12 @@ vpextrw $0x0,%xmm1,%eax ; / scalar hor_and After the scalar crossing check, `probeLeaf` is called at most once per unique direction per center leaf — inherently scalar tree traversal, not per-voxel. -**`cachedGetValue` — SIMD ingredient fetch, scalar value path** +**`cachedGetValue` — fully SIMD, no scalar loop** -The ingredient-fetch block — `mOffset`, `mPrefixSum[w]`, and `valueMask().words()[w]` -for all active lanes — is **fully SIMD** via the gather chain described in §8d. 
-The final value-fetch (scalar loop over `leaf->getValue(offset)`) is the remaining -work before the full SIMD index pipeline is wired in. +`cachedGetValue` is fully vectorised end-to-end. The scalar `leaf->getValue(offset)` +loop has been replaced by the gather chain described in §8e. The result is written +directly to `result` via a 2-arg `where(isActive, result) = ...` — no intermediate +variable, no write-back copy. ### 8b. No tree accessor in prefetch @@ -379,6 +393,76 @@ for (int cx : {0,1,2}) for (int cy : {0,1,2}) for (int cz : {0,1,2}) { } ``` +### 8e. `cachedGetValue` gather pipeline — Steps 1–8 + +`cachedGetValue` recomputes `packed_sum` identically to `prefetch` (§8a), then runs +the following fully-SIMD pipeline. All types are SIMD vectors of the indicated element +type; scalar `LaneWidth==1` degrades to plain scalar types. + +``` +Step 1 — d_vec (Simd) + base-32 multiply trick (§8d): per-lane dir ∈ [0,26] + +Step 2 — tapLeafID_u32 (Simd) + gather_if(tapLeafID_u32, leafMask, mNeighborLeafIDs, d_vec) + valid_u32 = (tapLeafID_u32 != kNullLeafID) ← effective mask for steps 3–5 + +Step 3 — tapLeafOffset_i64 (Simd) + simd_cast_if(tapLeafOffset_i64, valid_u32, tapLeafID_u32) + tapLeafOffset_i64 *= kStride (kStride = sizeof(LeafT)/sizeof(uint64_t)) + + Widening to int64_t is required: uint32_t * kStride overflows for large leaf + pools (kNullLeafID = 0xFFFFFFFF). simd_cast_if writes 0 for invalid lanes, + keeping gather indices non-negative. x86 vpgatherqq treats indices as signed + int64_t, so negative values would access memory before the base pointer. + +Step 4a — offsets (Simd) + gather_if(offsets, valid_u32, mOffsetBase, tapLeafOffset_i64) + → leaf->mOffset for each valid lane + +Step 4b — prefixSums (Simd) + gather_if(prefixSums, valid_u32, mPrefixBase, tapLeafOffset_i64) + Extract field w from packed mPrefixSum: + shift = (w > 0) ? (w-1)*9 : 0 + prefixSums = (w > 0) ? (prefixSums >> shift) & 511 : 0 + + mPrefixSum packs 7 nine-bit prefix counts in one uint64_t: + field w (1..7) at bits [9*(w-1) +: 9]; field 0 is defined as 0 (empty prefix). + +Step 5 — maskWords (Simd) + mask_idx = tapLeafOffset_i64 + simd_cast(wordIdx_u16) + gather_if(maskWords, valid_u32, mMaskWordBase, mask_idx) + → valueMask().words()[w] for each valid lane + + Heterogeneous mask: valid_u32 is SimdMask applied to uint64_t data. + Implemented via MaskElemT template parameter on gather_if in Simd.h. + +Step 6 — dest_yz (Simd) + dest_yz = ((packed_sum >> 2) & 0x38) | (packed_sum & 0x07) + → ny_w*8 + nz_w (6-bit intra-word bit position, range [0,63]) + +Step 7 — activity check + truncated maskWord + voxelBit = 1u64 << dest_yz + isActive = (maskWords & voxelBit) != 0 + truncated = maskWords & (voxelBit - 1) + + ValueOnIndex::getValue returns 0 for inactive voxels (bit not set in valueMask). + Null-leaf lanes have maskWords=0, so isActive=false there too — no explicit + valid_u32 guard is needed at this step. + +Step 8 — fill result + where(isActive, result) = offsets + prefixSums + popcount(truncated) + + 2-arg where writes only active lanes; leafMask-clear and inactive-voxel lanes + are untouched. +``` + +**popcount choice:** `popcount(Simd)` uses a SWAR shift-and-add tree +(`popcount64` in `Simd.h`) rather than `__builtin_popcountll`. AVX2 lacks a +64-bit lane-wise popcount (VPOPCNTQ is AVX-512DQ); `__builtin_popcountll` maps to +the scalar `popcnt` instruction, which is not vectorisable. The SWAR tree uses only +`vpsrlq` / `vpand` / `vpaddq`, which are all AVX2-native. + --- ## 9. 
Relationship to Phase 1 Prototype @@ -396,13 +480,17 @@ for (int cx : {0,1,2}) for (int cy : {0,1,2}) for (int cz : {0,1,2}) { --- -## 10. Future Work +## 10. Status and Future Work + +### Completed + +- `prefetch`: fully SIMD crossing detection, lazy probeLeaf. +- `cachedGetValue`: fully SIMD end-to-end (Steps 1–8, §8e). + Verified against scalar reference over 12M lane-checks across all 18 WENO5 taps. +- Class-level base pointers (`mOffsetBase`, `mPrefixBase`, `mMaskWordBase`). +- `simd_cast_if`, heterogeneous `gather_if`, `popcount64`/`popcount` added to `Simd.h`. -- **`cachedGetValue` vectorisation (Phase 2):** ingredient fetch (`mOffset`, - `mPrefixSum[w]`, `valueMask().words()[w]`) is now fully SIMD via the gather chain - in §8d. Remaining: popcount `(maskWord & partial_mask)` → global value index → - `gather_if(result, leafMask, globalValueArray, indices)` to replace the scalar - `leaf->getValue(offset)` loop. See `StencilGather.md §7b` for the AVX2 profile. +### Remaining - **`getValue`:** lazy combined `prefetch` + `cachedGetValue`. diff --git a/simd_test/Simd.h b/simd_test/Simd.h index 69ca628165..c6e8bdc68b 100644 --- a/simd_test/Simd.h +++ b/simd_test/Simd.h @@ -130,30 +130,36 @@ template inline bool all_of(SimdMask m) { return stdx::all_of(m); } // Unmasked gather: result[i] = ptr[idx[i]] for all lanes. -// Expressed as a generator constructor — Clang lowers to vgatherdps (all-ones mask). -template -inline Simd gather(const T* __restrict__ ptr, Simd idx) { +// IdxT may be int32_t or int64_t; the compiler selects the matching hardware +// instruction (vpgatherdps/vpgatherdq for 32-bit idx, vpgatherqq for 64-bit idx). +template +inline Simd gather(const T* __restrict__ ptr, Simd idx) { return Simd([&](int i) { return ptr[idx[i]]; }); } // Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback. // Implemented as a full gather + where-blend; ptr is accessed for ALL lanes, // so every idx[i] must be a valid offset regardless of mask[i]. -template +template inline Simd gather(SimdMask mask, const T* __restrict__ ptr, - Simd idx, T fallback = T(0)) { + Simd idx, T fallback = T(0)) { auto result = Simd(fallback); stdx::where(mask, result) = Simd([&](int i) { return ptr[idx[i]]; }); return result; } // Merge-masked gather: dst[i] = mask[i] ? ptr[idx[i]] : dst[i] (unchanged). -// Mirrors vgatherdps merge-masking semantics: dst is both input and output. -// Hope: compiler emits a single vgatherdps with dst as the destination register. -template -inline void gather_if(Simd& dst, SimdMask mask, - const T* __restrict__ ptr, Simd idx) { - stdx::where(mask, dst) = Simd([&](int i) { return ptr[idx[i]]; }); +// MaskElemT may differ from T (heterogeneous mask, e.g. SimdMask +// applied to Simd data). When T==MaskElemT, delegates directly +// to stdx::where; otherwise uses the WhereExpression boolean round-trip. +template +inline void gather_if(Simd& dst, SimdMask mask, + const T* __restrict__ ptr, Simd idx) { + if constexpr (std::is_same_v) { + stdx::where(mask, dst) = Simd([&](int i) { return ptr[idx[i]]; }); + } else { + where(mask, dst) = Simd([&](int i) { return ptr[idx[i]]; }); + } } // =========================================================================== @@ -330,8 +336,8 @@ NANOVDB_SIMD_HOSTDEV bool all_of(SimdMask m) { } // Unmasked gather: result[i] = ptr[idx[i]] for all lanes. 
-template -NANOVDB_SIMD_HOSTDEV Simd gather(const T* __restrict__ ptr, Simd idx) { +template +NANOVDB_SIMD_HOSTDEV Simd gather(const T* __restrict__ ptr, Simd idx) { Simd r; for (int i = 0; i < W; i++) r[i] = ptr[idx[i]]; return r; @@ -339,19 +345,20 @@ NANOVDB_SIMD_HOSTDEV Simd gather(const T* __restrict__ ptr, Simd // Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback. // Scalar path: accesses ptr only for true lanes (ternary short-circuits). -template +template NANOVDB_SIMD_HOSTDEV Simd gather(SimdMask mask, const T* __restrict__ ptr, - Simd idx, T fallback = T(0)) { + Simd idx, T fallback = T(0)) { Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? ptr[idx[i]] : fallback; return r; } // Merge-masked gather: dst[i] = mask[i] ? ptr[idx[i]] : dst[i] (unchanged). +// MaskElemT may differ from T (heterogeneous mask). // Scalar path: only accesses ptr for true lanes. -template -NANOVDB_SIMD_HOSTDEV void gather_if(Simd& dst, SimdMask mask, - const T* __restrict__ ptr, Simd idx) { +template +NANOVDB_SIMD_HOSTDEV void gather_if(Simd& dst, SimdMask mask, + const T* __restrict__ ptr, Simd idx) { for (int i = 0; i < W; i++) if (mask[i]) dst[i] = ptr[idx[i]]; } @@ -381,6 +388,60 @@ NANOVDB_SIMD_HOSTDEV Simd simd_cast(Simd src) { template NANOVDB_SIMD_HOSTDEV DstT simd_cast(SrcT src) { return static_cast(src); } +// --------------------------------------------------------------------------- +// simd_cast_if — masked element-wise cast (merge-masked). +// +// dst[i] = mask[i] ? static_cast(src[i]) : dst[i] (unchanged) +// +// Typical use: widen an integer index type into a wider type before arithmetic, +// keeping invalid (masked-out) lanes at their initial value (usually 0). +// On AVX-512 the compiler may emit a single masked vcvt/vpmovzx instruction. +// On AVX2 it lowers to an unmasked cast + blend. +// +// Scalar fallback: plain conditional cast. +// --------------------------------------------------------------------------- +template +NANOVDB_SIMD_HOSTDEV void simd_cast_if(Simd& dst, SimdMask mask, Simd src) { + dst = where(mask, simd_cast(src), dst); +} +template +NANOVDB_SIMD_HOSTDEV void simd_cast_if(DstT& dst, bool mask, SrcT src) { + if (mask) dst = static_cast(src); +} + +// --------------------------------------------------------------------------- +// popcount64 — scalar SWAR popcount, always uses arithmetic (no __builtin_popcountll). +// +// Safe to call per-lane inside a vectorizable loop: every operation (>>, &, +, -) +// maps to an AVX2 instruction for 64-bit elements (vpsrlq, vpand, vpaddq, vpsubq). +// The final byte-sum uses a shift-and-add tree instead of the multiply trick +// (v * 0x0101...) since 64x64->64 multiply has no AVX2 equivalent (vpmullq is AVX-512). +// --------------------------------------------------------------------------- +NANOVDB_SIMD_HOSTDEV inline uint64_t popcount64(uint64_t v) +{ + v -= (v >> 1) & uint64_t(0x5555555555555555); + v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333)); + v = (v + (v >> 4)) & uint64_t(0x0F0F0F0F0F0F0F0F); // per-byte counts + v += v >> 8; v &= uint64_t(0x00FF00FF00FF00FF); + v += v >> 16; v &= uint64_t(0x0000FFFF0000FFFF); + v += v >> 32; + return v & uint64_t(63); +} + +// Lane-wise SIMD popcount: applies popcount64 to every lane. +// Backend A: generator constructor; Backend B: element loop (auto-vectorized by GCC/Clang). 
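+// Example (illustrative): per lane, popcount64(0xFF) = 8, popcount64(0x8001) = 2,
+// popcount64((1ull << 63) - 1) = 63. Caveat: the final `& 63` maps a true count of
+// 64 (all-ones input) to 0; this is safe for Step 7's truncated = maskWords &
+// (voxelBit - 1), which always clears at least one bit, but a general-purpose
+// variant should mask with 127 instead.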
+template +NANOVDB_SIMD_HOSTDEV Simd popcount(Simd v) { +#ifdef NANOVDB_USE_STD_SIMD + return Simd([&](int i) { return popcount64(v[i]); }); +#else + Simd r; + for (int i = 0; i < W; ++i) r[i] = popcount64(v[i]); + return r; +#endif +} +NANOVDB_SIMD_HOSTDEV inline uint64_t popcount(uint64_t v) { return popcount64(v); } + // --------------------------------------------------------------------------- // simd_traits — generic per-lane access for scalar and Simd types. // @@ -477,12 +538,12 @@ NANOVDB_SIMD_HOSTDEV ScalarWhereProxy where(bool mask, T& target) { } // Unmasked scalar gather: result = ptr[idx]. -template -NANOVDB_SIMD_HOSTDEV T gather(const T* __restrict__ ptr, int32_t idx) { return ptr[idx]; } +template +NANOVDB_SIMD_HOSTDEV T gather(const T* __restrict__ ptr, IdxT idx) { return ptr[idx]; } // Merge-masked scalar gather: dst = ptr[idx] only if mask, else dst unchanged. -template -NANOVDB_SIMD_HOSTDEV void gather_if(T& dst, bool mask, const T* __restrict__ ptr, int32_t idx) { +template +NANOVDB_SIMD_HOSTDEV void gather_if(T& dst, bool mask, const T* __restrict__ ptr, IdxT idx) { if (mask) dst = ptr[idx]; } From 16b43acfbfca912c27658ad94df25185da88f45e Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Fri, 17 Apr 2026 16:46:41 -0500 Subject: [PATCH 24/60] BatchAccessor.md: document GCC/Clang codegen findings for cachedGetValue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add §8f with assembly comparison between GCC 13 and Clang 18 (-O3 -DNDEBUG -march=native, cachedGetValue<1,0,0>, W=16): - GCC does not inline any Simd.h helper (gather_if, simd_cast, where, popcount); emits 14 out-of-line calls and 13 vzeroupper transitions. gather_if body uses scalar vmovq/vpinsrq/vinserti128 — no hardware gathers. - Clang inlines everything except popcount; emits 2 vpgatherdd + 12 vpgatherqq hardware gather instructions and only 2 vzeroupper transitions. popcount body (88 ymm instrs, pure SWAR vpsrlq/vpand/vpaddq) is also fully vectorized but remains out-of-line. - 43 vpinsrb in Clang output are mask-widening cost for heterogeneous gather_if (SimdMask → 4x SimdMask). Action item added to §10: [[gnu::always_inline]] on Simd.h helpers would eliminate the GCC regression and fold popcount inline in both compilers. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../BatchAccessor.md | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md index 1bcc17dfbb..eea09f26ff 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md @@ -463,6 +463,52 @@ Step 8 — fill result the scalar `popcnt` instruction, which is not vectorisable. The SWAR tree uses only `vpsrlq` / `vpand` / `vpaddq`, which are all AVX2-native. +### 8f. Assembly codegen — GCC 13 vs Clang 18 + +Platform: x86-64, AVX2, `-O3 -DNDEBUG -march=native -std=c++17`. +Representative instantiation: `cachedGetValue<1,0,0>` (x+1 tap, W=16). 
+ +| Metric | GCC 13 | Clang 18 | +|--------|--------|----------| +| Total instructions | 465 | 535 | +| Vector (ymm) | 237 | 262 | +| `call` | 14 | **1** | +| `vzeroupper` | 13 | **2** | +| Hardware gather instructions | **0** | **14** | +| `gather_if` inlined | No | **Yes** | +| `simd_cast` / `where` inlined | No | **Yes** | +| `popcount` inlined | No | No | + +**GCC** emits every `Simd.h` helper (`gather_if`, `simd_cast`, `simd_cast_if`, `where`, +`popcount`) as an out-of-line call. The `gather_if` body does scalar element-by-element +loads via `vmovq` + `vpinsrq` + `vinserti128` — no hardware gather instructions. Each +out-of-line call forces a `vzeroupper` transition (≈80 cycles on many µarchs), giving 13 +such transitions per `cachedGetValue` call. + +**Clang** inlines everything except `popcount`. With `gather_if` inlined, Clang emits +actual hardware gather instructions: + +``` +vpgatherdd — 2× for the uint32_t tapLeafID gather (Step 2, 8-wide × 2 = 16 lanes) +vpgatherqq — 12× for the three uint64_t gathers (Steps 4a, 4b, 5: 4-wide × 4 = 16 lanes each) +``` + +Only 2 `vzeroupper` transitions remain (function entry/exit). + +The 43 `vpinsrb` instructions in the Clang output are the mask-format conversion cost +for the heterogeneous `gather_if` mask: `SimdMask` must be widened to +4 × `SimdMask` to drive `vpgatherqq`'s sign-bit mask mechanism. + +**`popcount`** body (out-of-line in both compilers): 88 instructions, 85 ymm. +The SWAR shift-and-add tree is fully vectorized with `vpsrlq`, `vpand`, `vpsubq`, +`vpaddq` — exactly the AVX2-friendly instruction set targeted in §8e. Adding +`[[gnu::always_inline]]` to `util::popcount` in `Simd.h` would fold these 88 +instructions inline and eliminate the last `callq`. + +**Action:** GCC inlining can be forced across the board with `[[gnu::always_inline]]` +on `gather_if`, `simd_cast`, `simd_cast_if`, `where`, and `popcount` in `Simd.h`. +This is a pure-upside change for GCC; Clang already inlines all but `popcount`. + --- ## 9. Relationship to Phase 1 Prototype @@ -492,6 +538,10 @@ the scalar `popcnt` instruction, which is not vectorisable. The SWAR tree uses ### Remaining +- **`[[gnu::always_inline]]` on `Simd.h` helpers:** `gather_if`, `simd_cast`, + `simd_cast_if`, `where`, `popcount` — eliminates 13 `vzeroupper` transitions per + `cachedGetValue` call under GCC 13 (§8f). + - **`getValue`:** lazy combined `prefetch` + `cachedGetValue`. - **Runtime `Coord` overload:** for generic stencil adapters iterating over an offset From 1c93d7863ba8004ad1b47e490fd61279337feafe Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Fri, 17 Apr 2026 16:50:44 -0500 Subject: [PATCH 25/60] BatchAccessor.md: document popcount alternatives and scalar popcnt tradeoffs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand §8f with analysis of three popcount strategies for the 16-lane uint64_t case: - SWAR shift-and-add tree (current): 88 instructions, all AVX2-friendly but port-intensive; stays in vector registers throughout. - Scalar popcnt with extract/reassemble: ~56 instructions; popcnt is pipelined (1/cycle throughput on port 1, 3-cycle latency) so 16 independent lanes retire in ~16 cycles, but the ymm↔GPR domain crossing adds ~2 cycles bypass latency per extraction and port 1 serialises all 16 popcnts. - vpshufb nibble-LUT + vpsadbw (recommended): ~40 instructions, no domain crossing, uses ports 0/5 and 5 (orthogonal to SWAR ports), standard compiler-generated AVX2 popcount pattern. Added as action item in §10. 
Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 .../BatchAccessor.md | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
index eea09f26ff..b63d285513 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
@@ -509,6 +509,34 @@ instructions inline and eliminate the last `callq`.
 on `gather_if`, `simd_cast`, `simd_cast_if`, `where`, and `popcount` in `Simd.h`.
 This is a pure-upside change for GCC; Clang already inlines all but `popcount`.
 
+**`popcount` alternative — `vpshufb`-based nibble popcount:**
+The current SWAR shift-and-add tree (88 instructions, §8e) avoids the scalar `popcnt`
+instruction because it is not vectorisable into `VPOPCNTQ` on AVX2. There are two
+other options worth considering:
+
+*Scalar `popcnt` with extract/reassemble:* `popcnt` is pipelined (Skylake+: 3-cycle
+latency, 1/cycle throughput on port 1; 16 independent lanes retire in ~16 cycles).
+The catch is the vector↔scalar domain crossing: extracting 16 uint64_t from 4 ymm
+registers requires ~20 `vpextrq`/`vextracti128` instructions, and reassembly costs
+another ~20 `vmovq`/`vpinsrq`/`vinserti128`. Total ≈ 56 instructions — fewer than
+SWAR, but the bypass latency penalty (~2 cycles per ymm→GPR crossing on Skylake)
+reduces the advantage, and port 1 serialises all 16 `popcnt`s.
+
+*`vpshufb`-based nibble popcount (recommended):* Stays entirely in vector registers,
+no domain crossing, and shrinks the body to ≈ 40 instructions:
+
+```
+lo   = v & 0x0F0F0F0F0F0F0F0F          (vpand)
+hi   = (v >> 4) & 0x0F0F0F0F0F0F0F0F   (vpsrlq + vpand)
+bpop = vpshufb(lut, lo) + vpshufb(lut, hi)   (2× vpshufb + vpaddq)
+sum  = vpsadbw(bpop, zero)             (horizontal byte-sum → 64-bit lane result)
+```
+
+`vpshufb` and `vpsadbw` use ports 0/5 and port 5 respectively — orthogonal to the
+arithmetic-heavy SWAR ports — so the `vpshufb` path is also more friendly to
+out-of-order overlap with surrounding code. This is the standard compiler-generated
+AVX2 popcount pattern and the likely replacement for `popcount64` in `Simd.h`.
+
 ---
 
 ## 9. Relationship to Phase 1 Prototype
@@ -542,6 +570,10 @@ This is a pure-upside change for GCC; Clang already inlines all but `popcount`.
   `simd_cast_if`, `where`, `popcount` — eliminates 13 `vzeroupper` transitions per
   `cachedGetValue` call under GCC 13 (§8f).
 
+- **`vpshufb`-based `popcount` in `Simd.h`:** replace `popcount64` SWAR tree with
+  nibble-LUT + `vpsadbw` pattern (§8f); reduces the out-of-line body from 88 to ≈40
+  instructions and uses orthogonal execution ports.
+
 - **`getValue`:** lazy combined `prefetch` + `cachedGetValue`.
 
 - **Runtime `Coord` overload:** for generic stencil adapters iterating over an offset

From 0b7524e41764d4d369d836914efc03d804110dd2 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sat, 18 Apr 2026 01:26:11 -0500
Subject: [PATCH 26/60] StencilAccessor: SIMD batch stencil-index gatherer for
 CPU WENO5 Phase 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds StencilAccessor — a compile-time-parameterized SIMD wrapper around
BatchAccessor that owns the straddling loop, hull-prefetch sequencing,
and per-tap cachedGetValue blending for one VBM block. Output is
mIndices[SIZE] of Simd<uint64_t, W>, one vector per stencil tap.
Includes Weno5Stencil (18 taps, 6-tap hull) and the findIndex constexpr fold for compile-time getValue() inverse-map lookup. Also: - BatchAccessor.h: add centerLeafID() getter used by StencilAccessor - BatchAccessor.md: expand §8f assembly matrix (compiler × backend × ISA) - stencil_gather_cpu.cpp: wire StencilAccessor into runPrototype with verifyStencilAccessor correctness checks; add rdtsc-based runPerf - Simd.h: remove = {} default argument from element_aligned_tag overloads - CLAUDE.md: add project build/architecture reference for Claude Code Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- CLAUDE.md | 119 ++++ .../stencil_gather_cpu.cpp | 195 ++++++- .../BatchAccessor.h | 38 +- .../BatchAccessor.md | 280 ++++++++-- .../StencilAccessor.h | 339 ++++++++++++ .../StencilAccessor.md | 506 ++++++++++++++++++ simd_test/Simd.h | 4 +- 7 files changed, 1423 insertions(+), 58 deletions(-) create mode 100644 CLAUDE.md create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..7dc163fd70 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,119 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build System + +OpenVDB uses CMake (minimum 3.24) and requires out-of-source builds. + +**Minimal core build:** +```bash +mkdir build && cd build +cmake .. -DOPENVDB_BUILD_UNITTESTS=ON +make -j$(nproc) +``` + +**Using the CI build script (recommended for full builds):** +```bash +./ci/build.sh --build-type=Release \ + --components="core,test" \ + --cargs="-DOPENVDB_ABI_VERSION_NUMBER=13" +``` + +Component flags for `--components`: `core`, `python`, `bin`, `view`, `render`, `test`, `hou`, `axcore`, `nano`, `nanotest` + +**Key CMake options:** +| Option | Default | Description | +|--------|---------|-------------| +| `OPENVDB_BUILD_CORE` | ON | Core library | +| `OPENVDB_BUILD_UNITTESTS` | OFF | Unit tests | +| `OPENVDB_BUILD_NANOVDB` | OFF | NanoVDB | +| `OPENVDB_BUILD_AX` | OFF | OpenVDB AX | +| `OPENVDB_ABI_VERSION_NUMBER` | 13 | ABI version (6–13) | +| `OPENVDB_CXX_STRICT` | OFF | Strict warnings | +| `NANOVDB_USE_CUDA` | OFF | CUDA support for NanoVDB | + +## Running Tests + +```bash +cd build +ctest -V # all tests +ctest -V -R TestGrid # single test by name +``` + +To build only specific unit tests (avoids full rebuild): +```bash +cmake .. -DOPENVDB_TESTS="Grid;Tree;LeafNode" +``` + +Tests use Google Test (minimum 1.10). Test sources live in: +- `openvdb/openvdb/unittest/` — core library tests (`TestFoo.cc` pattern) +- `nanovdb/nanovdb/unittest/` — NanoVDB tests + +## Code Architecture + +### Repository Layout + +``` +openvdb/openvdb/ Core OpenVDB library + tree/ Tree node hierarchy (RootNode, InternalNode, LeafNode) + tools/ Algorithm implementations (level sets, CSG, smoothing, etc.) 
+  math/       Math primitives (Vec, Mat, Quat, Transform, BBox)
+  io/         VDB file format I/O
+  points/     Point data grids
+  python/     Python bindings (nanobind)
+  unittest/   Unit tests
+
+nanovdb/nanovdb/        NanoVDB — compact, GPU-friendly VDB subset
+  tools/      CPU algorithms
+  tools/cuda/ CUDA kernels
+  examples/   Standalone example programs
+
+openvdb_ax/openvdb_ax/  OpenVDB AX — JIT expression language for VDB operations
+  ast/        Abstract syntax tree
+  codegen/    LLVM code generation
+  compiler/   Compilation pipeline
+
+openvdb_cmd/     Command-line tools (vdb_print, vdb_lod, vdb_tool, vdb_view, vdb_render)
+openvdb_houdini/ Houdini plugin
+openvdb_maya/    Maya plugin
+cmake/           CMake find-modules and configuration
+ci/              CI build/install scripts
+```
+
+### Core Data Model
+
+OpenVDB uses a **B+tree-like hierarchical sparse data structure**:
+- `Grid` — top-level container with transform and metadata
+- `Tree` — composed of `RootNode → InternalNode(s) → LeafNode`
+- Leaf nodes are 8×8×8 voxel blocks; internal nodes are 16³ and 32³ by default
+- `ValueAccessor` caches tree traversal paths for repeated access patterns
+- `GridBase` / `TypedGrid` provide the runtime-polymorphic/compile-time-typed split
+
+### NanoVDB vs OpenVDB
+
+NanoVDB is a read-optimized, single-allocation, GPU-portable subset of OpenVDB. It
+cannot be modified after construction. The `nanovdb/tools/CreateNanoGrid.h` and
+adjacent files handle conversion from OpenVDB grids to NanoVDB grids. The current
+branch (`mesh-to-grid`) adds `tools/cuda/MeshToGrid.cuh` for direct CUDA
+mesh-to-NanoVDB conversion.
+
+### OpenVDB AX
+
+AX compiles a domain-specific expression language to LLVM IR for execution over
+OpenVDB volumes and point grids. The pipeline is: source string → AST (`ast/`) →
+typed analysis → LLVM codegen (`codegen/`) → JIT execution via `compiler/`.
+
+## C++ Standard and ABI
+
+- Requires C++17 minimum
+- ABI version is set at compile time via `OPENVDB_ABI_VERSION_NUMBER`; the current
+  version is 13
+- Headers are in `openvdb/openvdb/` and installed to `include/openvdb/`
+
+## Dependencies
+
+Core: Boost ≥ 1.82, TBB ≥ 2020.3, Blosc ≥ 1.17, OpenEXR/Imath ≥ 3.2, zlib ≥ 1.2.7
+Tests: GTest ≥ 1.10
+Python bindings: Python ≥ 3.11, nanobind ≥ 2.5.0
+NanoVDB GPU: CUDA toolkit
+AX: LLVM
+
+On Linux, ASWF Docker containers (used by CI) bundle most dependencies. See
+`ci/install_macos.sh` and `ci/install_windows.ps1` for platform-specific setup.
+
+## Coding Standards
+
+Follow the style guide at https://www.openvdb.org/documentation/doxygen/codingStyle.html.
+Contributions require a Developer Certificate of Origin sign-off (`git commit -s`)
+and a CLA on file — see CONTRIBUTING.md.
diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index a359e069fc..1fda6c31c1 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -35,7 +35,10 @@
 #include
 #include
 #include   // SimdMask, Simd, any_of, none_of, to_bitmask
-#include "../ex_voxelBlockManager_host_cuda/BatchAccessor.h"   // BatchAccessor
+#include "../ex_voxelBlockManager_host_cuda/BatchAccessor.h"   // BatchAccessor
+#include "../ex_voxelBlockManager_host_cuda/StencilAccessor.h" // StencilAccessor, Weno5Stencil
+
+#include <immintrin.h> // __rdtsc, __rdtscp, _mm_lfence
 
 #include
 #include
@@ -71,6 +74,11 @@ using BAccT = nanovdb::BatchAccessor,   // VoxelOffsetT
 LaneMask>;   // PredicateT
 
+// StencilAccessor instantiation for WENO5.
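+// (Template arguments below: build type, SIMD lane width, stencil policy; see
+//  the "Template parameters" block in StencilAccessor.h.)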
+using SAccT = nanovdb::StencilAccessor<nanovdb::ValueOnIndex, SIMDw, nanovdb::Weno5Stencil>;
+// Return type of StencilAccessor::moveTo (mask over the uint64_t index domain).
+using IndexMaskT = nanovdb::util::SimdMask<uint64_t, SIMDw>;
+
 // Direction bit encoding shared across all stencil types:
 //   bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1),  dx,dy,dz ∈ {-1,0,+1}
 //
@@ -622,6 +630,105 @@ static void verifyBatchAccessor(
     checkOneTap< 0, 0,+3>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats);
 }
 
+// ============================================================
+// StencilAccessor correctness verification
+//
+// For every active lane (set in activeMask returned by moveTo):
+//   - Reconstruct the global coordinate from (leafIndex, voxelOffset).
+//   - For each of the 18 WENO5 taps, add the tap offset, decompose into
+//     leaf-local coordinates, probe the neighbor leaf, and compare
+//     stencilAcc[k][lane] against refLeaf->getValue(localOffset).
+//
+// For every inactive lane:
+//   - Assert that all tap slots hold 0 (the NanoVDB background index).
+// ============================================================
+
+static void verifyStencilAccessor(
+    const SAccT& stencilAcc,
+    IndexMaskT activeMask,        // returned by stencilAcc.moveTo()
+    const uint32_t* leafIndex,
+    const uint16_t* voxelOffset,
+    int batchStart,
+    const LeafT* firstLeaf,
+    AccT& refAcc,
+    VerifyStats& stats)
+{
+    // Check inactive lanes: all tap slots must hold 0 (background index).
+    for (int i = 0; i < SIMDw; ++i) {
+        if (activeMask[i]) continue;
+        for (int k = 0; k < stencilAcc.size(); ++k) {
+            ++stats.laneChecks;
+            const uint64_t got = static_cast<uint64_t>(stencilAcc[k][i]);
+            if (got != 0) {
+                ++stats.errors;
+                if (stats.errors <= 10)
+                    std::cerr << "STENCIL inactive lane=" << i
+                              << " tap=" << k
+                              << ": expected 0, got " << got << "\n";
+            }
+        }
+    }
+
+    // Check active lanes against the scalar tree reference.
+    for (int i = 0; i < SIMDw; ++i) {
+        if (!activeMask[i]) continue;
+
+        const int p = batchStart + i;
+        const uint16_t vo = voxelOffset[p];
+        const uint32_t li = leafIndex[p];
+        const nanovdb::Coord cOrigin = firstLeaf[li].origin();
+
+        // Center voxel local coordinates within the leaf.
+        const int lx = (vo >> 6) & 7;
+        const int ly = (vo >> 3) & 7;
+        const int lz = vo & 7;
+
+        for (int k = 0; k < 18; ++k) {
+            const int axis  = kWeno5Offsets[k][0];
+            const int delta = kWeno5Offsets[k][1];
+            const int di = (axis == 0) ? delta : 0;
+            const int dj = (axis == 1) ? delta : 0;
+            const int dk = (axis == 2) ? delta : 0;
+
+            // Tap destination in leaf-local space (may be outside [0,7]).
+            const int nx = lx + di, ny = ly + dj, nz = lz + dk;
+
+            // Leaf-crossing step (−1, 0, or +1 per axis).
+            const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0;
+            const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0;
+            const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0;
+
+            // Wrapped local coordinates within the target leaf.
+            const int nx_w = nx - dx * 8;
+            const int ny_w = ny - dy * 8;
+            const int nz_w = nz - dz * 8;
+            const uint32_t offset = uint32_t(nx_w) * 64u + uint32_t(ny_w) * 8u + uint32_t(nz_w);
+
+            // Reference: probe the target leaf and read its value.
+            const LeafT* refLeaf = (dx == 0 && dy == 0 && dz == 0)
+                ? &firstLeaf[li]
+                : refAcc.probeLeaf(cOrigin + nanovdb::Coord(dx * 8, dy * 8, dz * 8));
+
+            const uint64_t expected = refLeaf
+                ? 
static_cast<uint64_t>(refLeaf->getValue(offset))
+                : uint64_t(0);
+            const uint64_t actual = static_cast<uint64_t>(stencilAcc[k][i]);
+
+            ++stats.laneChecks;
+            if (actual != expected) {
+                ++stats.errors;
+                if (stats.errors <= 10)
+                    std::cerr << "STENCIL MISMATCH"
+                              << " tap=(" << di << "," << dj << "," << dk << ")"
+                              << " slot=" << k
+                              << " lane=" << i
+                              << " expected=" << expected
+                              << " actual=" << actual << "\n";
+            }
+        }
+    }
+}
+
 // ============================================================
 // Main prototype: Phase 1 (neighbor leaf resolution) + verification
 // ============================================================
@@ -676,6 +783,9 @@ static void runPrototype(const GridT*
     // BatchAccessor: alternate execution path for correctness cross-validation.
     BAccT batchAcc(*grid, currentLeafID);
 
+    // StencilAccessor: constructed once per block, persists across batches.
+    SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves);
+
     // Process SIMD batches.
     for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
 
@@ -686,6 +796,15 @@
         LaneMask activeMask = (leafSlice != LeafIdxVec(CPUVBM::UnusedLeafIndex));
         if (nanovdb::util::none_of(activeMask)) continue;
 
+        // StencilAccessor: gather all 18 WENO5 tap indices for this batch.
+        // moveTo owns the straddling loop internally; call once per batch.
+        {
+            const IndexMaskT sActive =
+                stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
+            verifyStencilAccessor(stencilAcc, sActive, leafIndex, voxelOffset,
+                                  batchStart, firstLeaf, acc, stats);
+        }
+
         // Track straddle batches for diagnostic output.
         for (int i = 0; i < SIMDw; i++) {
             if (activeMask[i] && leafIndex[batchStart + i] != currentLeafID) {
@@ -802,6 +921,79 @@
     std::cerr << "  FAILED: " << stats.errors << " mismatches\n";
 }
 
+// ============================================================
+// Performance measurement: StencilAccessor::moveTo throughput
+//
+// Two-pass design: first pass warms instruction cache, branch predictor,
+// and the leaf data accessed by advance()/prefetch(). Second pass is timed.
+// decodeInverseMaps is outside the rdtsc fence — we measure moveTo only.
+//
+// Reports TSC ticks/batch and TSC ticks/voxel (using BlockWidth as denominator;
+// slightly over-counts inactive padding slots but is stable across runs).
+// TSC ticks ≈ ns × (nominal_GHz); divide by actual turbo frequency for
+// CPU cycles if needed.
+// ============================================================
+
+static void runPerf(const GridT* grid,
+                    const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle)
+{
+    const LeafT* firstLeaf = grid->tree().getFirstNode<0>();
+    const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount();
+    const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID();
+    const uint64_t* jumpMap = vbmHandle.hostJumpMap();
+
+    alignas(64) uint32_t leafIndex[BlockWidth];
+    alignas(64) uint16_t voxelOffset[BlockWidth];
+
+    static constexpr int kBatchesPerBlock = BlockWidth / SIMDw;
+
+    // Shared decode + moveTo loop, run twice (warmup then timed).
+    uint64_t totalTicks = 0;
+
+    for (int pass = 0; pass < 2; ++pass) {
+        uint64_t passTicks = 0;
+
+        for (uint32_t bID = 0; bID < nBlocks; ++bID) {
+            const uint64_t blockFirstOffset =
+                vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth;
+
+            // Decode is outside the timed region.
+            CPUVBM::decodeInverseMaps(
+                grid, firstLeafID[bID],
+                &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength],
+                blockFirstOffset, leafIndex, voxelOffset);
+
+            int nExtraLeaves = 0;
+            for (int w = 0; w < CPUVBM::JumpMapLength; ++w)
+                nExtraLeaves += nanovdb::util::countOn(
+                    jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]);
+
+            SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves);
+
+            _mm_lfence();
+            const uint64_t t0 = __rdtsc();
+
+            for (int b = 0; b < kBatchesPerBlock; ++b)
+                stencilAcc.moveTo(leafIndex + b * SIMDw, voxelOffset + b * SIMDw);
+
+            uint32_t aux;
+            const uint64_t t1 = __rdtscp(&aux);
+
+            passTicks += (t1 - t0);
+        }
+
+        if (pass == 1) totalTicks = passTicks; // only record the warm pass
+    }
+
+    const uint64_t totalBatches = (uint64_t)nBlocks * kBatchesPerBlock;
+    const uint64_t totalVoxels  = (uint64_t)nBlocks * BlockWidth;
+
+    std::printf("\nStencilAccessor::moveTo throughput (warm pass, %u blocks):\n", nBlocks);
+    std::printf("  total TSC ticks : %lu\n", totalTicks);
+    std::printf("  ticks / batch   : %.1f\n", double(totalTicks) / double(totalBatches));
+    std::printf("  ticks / voxel   : %.2f\n", double(totalTicks) / double(totalVoxels));
+}
+
 // ============================================================
 // Entry point
 // ============================================================
@@ -856,6 +1048,7 @@ int main(int argc, char** argv)
               << " (BlockWidth=" << BlockWidth << ")\n\n";
 
         runPrototype(grid, vbmHandle);
+        runPerf(grid, vbmHandle);
 
     } catch (const std::exception& e) {
         std::cerr << "Exception: " << e.what() << "\n";
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
index 3ad9c29efa..9de8933334 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
@@ -156,6 +156,15 @@ class BatchAccessor
         mNeighborLeafIDs[dir(0, 0, 0)] = mCenterLeafID;
     }
 
+    // -------------------------------------------------------------------------
+    // centerLeafID -- read the current center leaf ID
+    //
+    // Exposed for StencilAccessor::moveTo, which needs it for the
+    // leafSlice == centerLeafID() comparison in the straddling loop.
+    // There is no raw setter; advance() is the sole legitimate transition.
+    // -------------------------------------------------------------------------
+    uint32_t centerLeafID() const { return mCenterLeafID; }
+
     // -------------------------------------------------------------------------
     // advance -- move to a new center leaf
     //
@@ -400,11 +409,12 @@ class BatchAccessor
             & VoxelOffsetT(kDirMask);
         const auto d_i32 = util::simd_cast<int32_t>(d_u16);
 
-        // Step 2 -- leaf IDs: gather only for active lanes; inactive lanes keep kNullLeafID.
-        // valid_u32 is then the combined effective mask (leafMask AND neighbor exists).
-        LeafIDVecT tapLeafID_u32(kNullLeafID);
-        util::gather_if(tapLeafID_u32, leafMask, mNeighborLeafIDs, d_i32);
-        const auto valid_u32 = (tapLeafID_u32 != LeafIDVecT(kNullLeafID)); // SimdMask
+        // Step 2 -- leaf IDs: unmasked gather (all lanes have d_i32 ∈ [0,26] by
+        // SWAR invariant, so mNeighborLeafIDs[d_i32[i]] is always a valid access).
+        // Non-leafMask lanes read the current center leaf's neighbor at direction d,
+        // which is filtered out by the explicit leafMask AND in valid_u32 below.
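+        // (The unconditional dst[i] = ptr[idx[i]] shape below is what lets Clang's
+        //  auto-vectorizer emit a hardware gather here; see §8f Finding 5 in
+        //  BatchAccessor.md.)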
+        const LeafIDVecT tapLeafID_u32 = util::gather(mNeighborLeafIDs, d_i32);
+        const auto valid_u32 = leafMask & (tapLeafID_u32 != LeafIDVecT(kNullLeafID));
 
         // Step 3 -- stride-scaled gather indices (widened to int64_t, invalid lanes -> 0)
         // kStride is sizeof(LeafT)/sizeof(uint64_t); the static_assert makes the
@@ -417,15 +427,15 @@ class BatchAccessor
         util::simd_cast_if(tapLeafOffset_i64, valid_u32, tapLeafID_u32);
         tapLeafOffset_i64 = tapLeafOffset_i64 * Int64VecT(kStride);
 
-        // Step 4a -- offsets (mOffset)
-        LeafDataVecT offsets(0);
-        util::gather_if(offsets, valid_u32, mOffsetBase, tapLeafOffset_i64);
+        // Step 4a -- offsets (mOffset): unmasked gather.
+        // Invalid lanes have tapLeafOffset_i64=0 (from simd_cast_if), reading from
+        // index 0 (center leaf's data). These lanes are excluded by isActive in Step 7.
+        const LeafDataVecT offsets = util::gather(mOffsetBase, tapLeafOffset_i64);
 
-        // Step 4b -- prefixSums (mPrefixSum packed uint64_t, shift-extract field w)
-        // Invalid lanes have prefixSums=0 after gather_if; (0>>shift)&511=0 for any shift,
-        // so the outer valid_u32 guard from before is not needed.
-        LeafDataVecT prefixSums(0);
-        util::gather_if(prefixSums, valid_u32, mPrefixBase, tapLeafOffset_i64);
+        // Step 4b -- prefixSums (mPrefixSum packed uint64_t, shift-extract field w):
+        // unmasked gather for the same reason as Step 4a. After the shift-extract
+        // below, invalid-lane values don't matter because isActive filters them in Step 8.
+        LeafDataVecT prefixSums = util::gather(mPrefixBase, tapLeafOffset_i64);
         const auto wordIdx_u64 = util::simd_cast<uint64_t>(wordIdx_u16);
         const auto nonzero_w = (wordIdx_u64 != LeafDataVecT(0));
         const auto shift = util::where(nonzero_w, (wordIdx_u64 - LeafDataVecT(1)) * LeafDataVecT(9), LeafDataVecT(0));
@@ -434,6 +444,8 @@ class BatchAccessor
         // Step 5 -- maskWords (valueMask().words()[w])
         // mMaskWordBase[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w]
         // because the mask field is at a fixed offsetof within every LeafT.
+        // Kept as gather_if (masked) so that invalid lanes get maskWords=0, which
+        // guarantees isActive=false in Step 7 without needing a cross-width mask AND.
         const auto wordIdx_i64 = util::simd_cast<int64_t>(wordIdx_u16);
         const auto mask_idx = tapLeafOffset_i64 + wordIdx_i64;
         LeafDataVecT maskWords(0);
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
index b63d285513..47a3d0bfbe 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md
@@ -463,51 +463,154 @@ Step 8 — fill result
 the scalar `popcnt` instruction, which is not vectorisable. The SWAR tree uses
 only `vpsrlq` / `vpand` / `vpaddq`, which are all AVX2-native.
 
-### 8f. Assembly codegen — GCC 13 vs Clang 18
-
-Platform: x86-64, AVX2, `-O3 -DNDEBUG -march=native -std=c++17`.
-Representative instantiation: `cachedGetValue<1,0,0>` (x+1 tap, W=16).
-
-| Metric | GCC 13 | Clang 18 |
-|--------|--------|----------|
-| Total instructions | 465 | 535 |
-| Vector (ymm) | 237 | 262 |
-| `call` | 14 | **1** |
-| `vzeroupper` | 13 | **2** |
-| Hardware gather instructions | **0** | **14** |
-| `gather_if` inlined | No | **Yes** |
-| `simd_cast` / `where` inlined | No | **Yes** |
-| `popcount` inlined | No | No |
-
-**GCC** emits every `Simd.h` helper (`gather_if`, `simd_cast`, `simd_cast_if`, `where`,
-`popcount`) as an out-of-line call. 
The `gather_if` body does scalar element-by-element
-loads via `vmovq` + `vpinsrq` + `vinserti128` — no hardware gather instructions. Each
-out-of-line call forces a `vzeroupper` transition (≈80 cycles on many µarchs), giving 13
-such transitions per `cachedGetValue` call.
-
-**Clang** inlines everything except `popcount`. With `gather_if` inlined, Clang emits
-actual hardware gather instructions:
+### 8f. Assembly codegen — compiler × backend × ISA matrix
+
+Flags: `-O3 -DNDEBUG -std=c++17 -fopenmp-simd -Wno-invalid-offsetof`.
+ISA: `-mavx2` (base) or `-march=native` (i9-285K Arrow Lake, AVX2; no AVX-512).
+Representative instantiation: `cachedGetValue<-3,0,0>` (x−3 tap, W=16), full Steps 1–8.
+
+**Backend selection:** Simd.h auto-detects `<experimental/simd>` via `__has_include`.
+`-DNANOVDB_USE_STD_SIMD` is redundant when the header is present.
+Use `-DNANOVDB_NO_STD_SIMD` to force the array backend.
+
+#### `cachedGetValue<-3,0,0>` — instruction counts
+
+Numbers reflect the **unmasked-gather variant** (Steps 2/4a/4b changed to `gather`;
+Step 5 `maskWords` kept as `gather_if`). The `ymm`/`xmm`/`calls`/`vzup`/`vpins`
+columns are from the original full measurement; `insns` and `vpgather` are
+post-unmasked-gather. `—` = not separately measured.
+
+| Variant | ISA | insns | ymm | xmm | calls | vzup | vpgather | vpins |
+|---------|-----|------:|----:|----:|------:|-----:|---------:|------:|
+| GCC 13 + stdx | avx2 | 579 | 393 | 100 | 14 | 13 | 0 | 8 |
+| GCC 13 + array | avx2 | 1313 | 605 | 524 | 2 | 3 | 0 | 0 |
+| Clang 18 + stdx | avx2 | 828 | 530 | 470 | 1 | 2 | 0 | 62 |
+| Clang 18 + array | avx2 | 1231 | 459 | 326 | 2 | 2 | 0 | 0 |
+| GCC 13 + stdx | native | 641 | 393 | 100 | 14 | 13 | 0 | 8 |
+| GCC 13 + array | native | 1175 | — | — | 0 | 0 | 0 | — |
+| Clang 18 + stdx | native | 599 | 568 | 284 | 1 | 2 | **16** | 50 |
+| Clang 18 + array | native | 1200 | — | — | — | — | **6** | — |
+
+`vpgather` breakdown (post-unmasked-gather):
+- `clang18-stdx-native`: 4× `vpgatherdd` (Step 2: 16 lanes in 4×4) + 12× `vpgatherqq`
+  (Steps 4a/4b/5: 4-wide ×4 chunks ×3) = 16 total
+- `clang18-array-native`: 2× `vpgatherdd` + 4× `vpgatherqq` = 6 total
+
+#### Before/after delta — unmasked-gather change
+
+| Variant | ISA | insns before | insns after | Δ | vpgather before | vpgather after |
+|---------|-----|------------:|------------:|--:|----------------:|---------------:|
+| GCC 13 + stdx | avx2 | 641 | 579 | −62 | 0 | 0 |
+| GCC 13 + array | avx2 | 1320 | 1313 | −7 | 0 | 0 |
+| Clang 18 + stdx | avx2 | 795 | 828 | +33 | 0 | 0 |
+| Clang 18 + array| avx2 | 1365 | 1231 | −134 | 0 | 0 |
+| GCC 13 + stdx | native | 641 | 641 | 0 | 0 | 0 |
+| GCC 13 + array | native | 1365 | 1175 | −190 | 0 | 0 |
+| Clang 18 + stdx | native | 600 | 599 | −1 | 14 | 16 |
+| Clang 18 + array| native | 1365 | 1200 | −165 | 0 | 6 |
+
+The `clang18-stdx-avx2` regression (+33) is expected: the unmasked `gather` path
+in the stdx backend emits a slightly different `where`-free code sequence that Clang
+does not fold as aggressively as the original `gather_if`. Total instruction count
+is still lower than the array backend.
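+
+To make the two loop shapes concrete, here is a minimal sketch (not the Simd.h
+source; the array-backend storage and helper names are assumed) of the masked and
+unmasked gather loops these findings refer to:
+
+```cpp
+// Hypothetical array-backend helpers: W lanes held in plain C arrays.
+template<typename T, typename IdxT, int W>
+void gather(T (&dst)[W], const T* __restrict__ ptr, const IdxT (&idx)[W]) {
+    for (int i = 0; i < W; ++i)
+        dst[i] = ptr[idx[i]];               // unconditional load+store: the one
+}                                           // shape Clang turns into vpgatherqq
+
+template<typename T, typename IdxT, int W>
+void gather_if(T (&dst)[W], const bool (&mask)[W],
+               const T* __restrict__ ptr, const IdxT (&idx)[W]) {
+    for (int i = 0; i < W; ++i)
+        if (mask[i]) dst[i] = ptr[idx[i]];  // conditional store: defeats the
+}                                           // gather auto-vectorizer (Finding 5)
+```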
+
+#### `-mavx2 -mtune=native` equivalence
+
+On this machine (i9-285K Arrow Lake, no AVX-512), `-march=native` and
+`-mavx2 -mtune=native` produce **identical hardware-gather emission** under Clang:
+
+| Variant | flags | insns | vpgdd | vpgqq |
+|---------|-------|------:|------:|------:|
+| Clang 18 + stdx | `-mavx2 -mtune=native` | 599 | 4 | 12 |
+| Clang 18 + array | `-mavx2 -mtune=native` | 1219 | 2 | 4 |
+
+The difference between `-mavx2` and `-march=native` is purely the **tuning model**,
+not the ISA:
+- `-mavx2`: targets `mtune=generic` — conservative gather cost model, no hardware gathers.
+- `-march=native` (Clang): implies `mtune=sierraforest` — knows Arrow Lake's gather
+  throughput, auto-vectorizer considers gathers profitable → emits `vpgatherqq`.
+- `-march=native` (GCC): sets the ISA to sierraforest but keeps `mtune=generic` —
+  same conservative behaviour as `-mavx2`. No hardware gathers emitted by GCC even
+  with `-march=native`.
+
+GCC's stdx backend produces identical output (641 insns before / 579 after, 0 gathers)
+for both `-mavx2` and `-march=native`.
+
+#### `prefetch<-3,0,0>` — standalone vs inlined
+
+| Variant | ISA | standalone symbol? | insns |
+|---------|-----|--------------------|------:|
+| GCC 13 + stdx | any | No — fully inlined | — |
+| GCC 13 + array | avx2 | Yes | 260 |
+| Clang 18 + stdx | any | No — fully inlined | — |
+| Clang 18 + array| avx2 | Yes | 176 |
+
+---
+
+**Finding 1 — stdx backend is far superior to the array backend.**
+The array backend is ≈2× larger in instruction count and degrades every `gather_if`
+to a scalar lane-by-lane loop: 16 `vpextrw` to extract uint16_t direction indices, 16
+conditional branches, 16 scalar uint32_t loads from `mNeighborLeafIDs`, then repeated
+for each of the three uint64_t gathers (48 `vpextrq` total). In the stdx backends,
+`gather_if` either maps to hardware gather instructions (Clang + native) or at worst
+compact `vpinsrq` sequences (Clang + avx2). The 76 vpextr instructions (array backend)
+vs 62 vpinsrb/q (stdx avx2) is telling: array is still scalar-inserting via extract,
+not vectorised. The array backend also fails to inline `prefetch`.
+
+**Finding 2 — Clang inlines all helpers; GCC emits 14 out-of-line weak stubs.**
+GCC 13 emits `gather_if`, `simd_cast`, `simd_cast_if`, `where`, and `popcount` as
+out-of-line COMDAT weak symbols and calls them. Each call requires `vzeroupper` on
+entry (AVX ABI), yielding 13 transitions per `cachedGetValue` invocation. Clang 18
+inlines all of them into a single function body except the final `popcount` call.
+
+**Finding 3 — Hardware gathers require Clang + native tuning; unmasked gathers unlock the array backend too.**
+After the unmasked-gather change, `clang18-stdx-native` emits **16** hardware gathers per `cachedGetValue`:
 
 ```
-vpgatherdd — 2× for the uint32_t tapLeafID gather (Step 2, 8-wide × 2 = 16 lanes)
-vpgatherqq — 12× for the three uint64_t gathers (Steps 4a, 4b, 5: 4-wide × 4 = 16 lanes each)
+vpgatherdd — 4× for the uint32_t tapLeafID gather (Step 2: 4-wide × 4 = 16 lanes)
+vpgatherqq — 12× for the three uint64_t data gathers (Steps 4a/4b/5: 4-wide × 4 each)
 ```
-
-Only 2 `vzeroupper` transitions remain (function entry/exit).
-
-The 43 `vpinsrb` instructions in the Clang output are the mask-format conversion cost
-for the heterogeneous `gather_if` mask: `SimdMask<uint32_t, 16>` must be widened to
-4 × `SimdMask<uint64_t, 4>` to drive `vpgatherqq`'s sign-bit mask mechanism.
-
-**`popcount`** body (out-of-line in both compilers): 88 instructions, 85 ymm.
-The SWAR shift-and-add tree is fully vectorized with `vpsrlq`, `vpand`, `vpsubq`,
-`vpaddq` — exactly the AVX2-friendly instruction set targeted in §8e. Adding
-`[[gnu::always_inline]]` to `util::popcount` in `Simd.h` would fold these 88
-instructions inline and eliminate the last `callq`.
-
-**Action:** GCC inlining can be forced across the board with `[[gnu::always_inline]]`
-on `gather_if`, `simd_cast`, `simd_cast_if`, `where`, and `popcount` in `Simd.h`.
-This is a pure-upside change for GCC; Clang already inlines all but `popcount`.
+`clang18-array-native` now emits **6** hardware gathers (2 vpgdd + 4 vpgqq) — the first
+gathers ever seen in the array backend. The unmasked `for (i) dst[i] = ptr[idx[i]]`
+loop is the pattern Clang's auto-vectorizer converts to `vpgatherqq`; the `if (mask[i])`
+conditional in `gather_if` defeated auto-vectorization for all mask types.
+
+GCC 13 emits 0 hardware gathers even with `-march=native` — its stdx backend does not
+exploit `vpgatherdd`/`vpgatherqq` for `experimental::simd` gather operations. With
+`-mavx2` alone, Clang also falls back to software gather (62 `vpinsrq/b`).
+
+The 50 `vpinsrb` that remain in `clang18-stdx-native` are the mask-widening cost for
+the one remaining heterogeneous `gather_if` (Step 5 `maskWords`): `SimdMask<uint32_t, 16>`
+is widened to four `SimdMask<uint64_t, 4>` chunks to provide the sign-bit masks that
+`vpgatherqq` expects.
+
+**Finding 4 — `-march=native` gains nothing for GCC, in either backend.**
+GCC's stdx backend produces identical output (641/579 insns, 0 gathers) for both
+`-mavx2` and `-march=native`. The array backend with `-march=native` (1175 insns,
+0 gathers) also emits zero hardware gathers — even for the bare unmasked
+`for (i) dst[i] = ptr[idx[i]]` loop that Clang converts to `vpgatherqq`. GCC's
+auto-vectorizer cost model treats gather instructions as unprofitable regardless of
+tuning, preferring 40 `vpextrq` + 16 `vpinsrq` + 65 `vmovq` (scalar lane-by-lane)
+instead. This is a GCC backend policy, not a flag or mask-type issue.
+
+**Finding 5 — Masking was the auto-vectorizer blocker for gathers.**
+`gather_if` takes an `if (mask[i]) dst[i] = ptr[idx[i]]` shape — a conditional store.
+This defeats Clang's gather auto-vectorizer for every mask element type tried (bool,
+uint32_t, uint64_t). The unmasked `gather` loop `dst[i] = ptr[idx[i]]` is the one
+pattern that Clang + native tuning converts to `vpgatherqq`. The sentinel invariant
+makes the change safe: Step 2 uses `d ∈ [0,26]` (SWAR always valid); Steps 4a/4b use
+`tapLeafOffset_i64 = 0` for invalid lanes (reading from base[0], the center leaf — safe
+but unused); Step 5 is kept masked so that `maskWords = 0` for invalid lanes, ensuring
+`isActive = false` without a cross-width mask AND.
+
+**`popcount`** (out-of-line in all variants that reach it): 88 instructions, 85 ymm.
+Fully vectorised with `vpsrlq`, `vpand`, `vpsubq`, `vpaddq`. Adding
+`[[gnu::always_inline]]` to `util::popcount` in Simd.h eliminates the last remaining
+out-of-line call in the Clang path and reduces GCC from 14 to 13 external calls.
+
+**Action — `[[gnu::always_inline]]` on Simd.h helpers:**
+Adding `[[gnu::always_inline]]` (or `__attribute__((always_inline))`) to `gather_if`,
+`simd_cast`, `simd_cast_if`, `where`, and `popcount` in Simd.h eliminates all 13
+`vzeroupper` transitions under GCC. Clang already inlines all but `popcount`; the
+attribute is safe and a no-op for Clang.
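+
+As a one-line sketch of the proposed change, applied here to the scalar `gather`
+from the Simd.h excerpt above (GCC requires the attribute to be paired with
+`inline`):
+
+```cpp
+template<typename T, typename IdxT>
+[[gnu::always_inline]] inline
+NANOVDB_SIMD_HOSTDEV T gather(const T* __restrict__ ptr, IdxT idx) { return ptr[idx]; }
+```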
**`popcount` alternative — `vpshufb`-based nibble popcount:** The current SWAR shift-and-add tree (88 instructions, §8e) avoids the scalar `popcnt` @@ -537,6 +640,88 @@ arithmetic-heavy SWAR ports — so the `vpshufb` path is also more friendly to out-of-order overlap with surrounding code. This is the standard compiler-generated AVX2 popcount pattern and the likely replacement for `popcount64` in `Simd.h`. +### 8g. Cycle budget and architectural comparison + +#### `cachedGetValue` critical path (Clang 18 + stdx + `-march=native`, W=16) + +| Step | Work | Cumulative cycles | +|------|------|------------------:| +| 1 | SWAR expansion + base-32 multiply → `d_vec` | ~8 | +| 2 | 4× `vpgatherdd` → `tapLeafID_u32` | ~20 | +| 3 | `simd_cast_if` + ×kStride → `tapLeafOffset_i64` | ~25 | +| 4a/4b/5 | 4+4+4 `vpgatherqq` (3 independent groups, overlap in OoO) | ~41 | +| 6–8 | bitwise `dest_yz`, `maskWords & voxelBit`, popcount SWAR + `where` | **~55** | + +Critical path per call: **~55 cycles** (gather-chain limited; Steps 4a/4b/5 are the +deepest dependency). + +Single-core throughput reality: each call is ~600 instructions. Arrow Lake's ROB +(~500 entries) holds less than one full call, so call-to-call OoO overlap is minimal. +Realistic single-core cost is **~80–100 cy/call**, not the ~7 cy/call that perfect 8× +OoO would imply. For 128 elements × 18 taps = 144 calls: **~12,000–14,000 cycles +single-threaded**, or **~100 cy/element**. + +#### Comparison with scalar NanoVDB `getValue(ijk)` + +Naive alternative: 128 voxels × 18 taps = 2304 scalar `ReadAccessor::getValue()` calls. + +| Accessor L0 cache behaviour | cy/call | 2304 calls | cy/element | +|-----------------------------|--------:|-----------:|-----------:| +| Hit (same leaf as last call) | ~22 | ~51,000 | ~400 | +| Miss, tree nodes L1-warm | ~52 | ~120,000 | ~940 | +| Miss, tree nodes cold | ~100+ | ~230,000 | ~1800 | + +**BatchAccessor speedup: 4–10× depending on hit rate.** + +The two sources of gain: + +1. **Amortised tree traversal (dominant).** `prefetch` calls `probeLeaf` at most once + per direction per center-leaf switch — **12 calls** for a 128-element block (6 + directions × 2 center-leaf switches) vs. up to 2304 traversals for the scalar path. + Each saved traversal is ~25–35 cycles of pointer-chasing through root → internal → + internal → leaf with warm L1 nodes. + +2. **SIMD × 16.** The SWAR expansion, gather chain, and popcount all execute once for + 16 lanes simultaneously. Even if the scalar accessor hit perfectly on every call, + the SIMD path still wins by ~4× on arithmetic work alone. + +The scalar hit rate depends on loop ordering. Processing all 18 taps for one voxel +before moving to the next evicts the cached leaf on nearly every tap switch (high miss +rate). Sweeping all 128 voxels for one tap at a time improves hit rate, but requires +18 passes over the voxel array and hurts reuse of stencil results. + +#### CPU vs GPU: why the same operation inverts + +On CPU (8 P-cores), the 128-element block is **compute-bound**: + +- Index computation: ~12,000 cy per core +- Value fetch (512 unique floats, 32 cache lines, 8 cores competing for DDR5-5600): + ~80–664 cycles depending on cache level and core count +- System DRAM bandwidth consumed at full parallelism: ~4.6 GB/s out of 89 GB/s + available (~5% utilisation) + +The gather chain latency is the bottleneck; bandwidth sits largely idle. 
The CPU
+BatchAccessor design (SIMD W=16, hardware `vpgatherqq`) directly attacks this by
+compressing 16 serial gather chains into one parallel 55-cycle critical path.
+
+On GPU the same operation becomes **bandwidth-bound**:
+
+- An SM has hundreds of warps in flight. When a warp stalls on a gather or arithmetic
+  latency (~20–100+ cycles), the scheduler switches to another ready warp instantly.
+  The entire index computation — SWAR, base-32 multiply, all gather latencies — is
+  absorbed by warp switching. Effective compute cost per thread: ~0 stall cycles.
+- What remains visible to the GPU is the **global memory traffic**: fetching stencil
+  float values. With hundreds of SMs each issuing many transactions simultaneously,
+  HBM bandwidth saturates quickly.
+- GPU gathers are scalar-per-thread: 32 threads in a warp each doing an 8-byte load =
+  32 independent transactions. Non-contiguous addresses (stencil neighbours across
+  leaves) yield uncoalesced access, amplifying bandwidth pressure.
+
+Consequently, GPU optimisation for this workload targets **coalescing** (adjacent
+threads access adjacent values) and **cache footprint** (keeping the neighbour-leaf
+working set in L1/shared memory), rather than the gather-chain depth that dominates
+on CPU.
+
 ---
 
 ## 9. Relationship to Phase 1 Prototype
@@ -563,6 +748,17 @@ AVX2 popcount pattern and the likely replacement for `popcount64` in `Simd.h`.
   Verified against scalar reference over 12M lane-checks across all 18 WENO5 taps.
 - Class-level base pointers (`mOffsetBase`, `mPrefixBase`, `mMaskWordBase`).
 - `simd_cast_if`, heterogeneous `gather_if`, `popcount64`/`popcount` added to `Simd.h`.
+- Simd.h array-backend `Simd(const T*, element_aligned_tag)` load constructor:
+  removed default argument for the tag to eliminate the `Simd(0)` null-pointer-constant
+  ambiguity that breaks compilation under `-DNANOVDB_NO_STD_SIMD`.
+- Full 7-variant codegen analysis (compiler × backend × ISA, §8f), including
+  before/after delta for the unmasked-gather change and `-mavx2 -mtune=native`
+  equivalence finding.
+- **Unmasked gather (Steps 2/4a/4b):** `gather_if` replaced with `gather` using the
+  sentinel invariant (d ∈ [0,26]; invalid lanes read base[0]). Step 5 kept masked so
+  `maskWords=0` for invalid lanes → `isActive=false` without cross-width mask AND.
+  Verified: 12M lane-checks pass across all 18 WENO5 taps. Unlocks hardware
+  `vpgatherqq` in the array backend under Clang + native tuning.
 
 ### Remaining
 
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h
new file mode 100644
index 0000000000..2e4bd84a0f
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h
@@ -0,0 +1,339 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/*!
+    \file StencilAccessor.h
+
+    \brief SIMD stencil-index gatherer built on BatchAccessor.
+
+    Wraps a BatchAccessor and owns the straddling loop, prefetch-hull
+    sequencing, and per-tap cachedGetValue calls for one VBM block.
+    Its output is a fixed-size array of Simd<uint64_t, W> — one vector
+    per stencil tap — containing ValueOnIndex indices for all W lanes.
+
+    Design documented in:
+    nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md
+
+    Template parameters
+    -------------------
+    BuildT     NanoVDB build type (e.g. ValueOnIndex).
+    W          SIMD lane width.
+    StencilT   Policy class describing the stencil. 
Must expose:
+                 using Taps = std::tuple<StencilPoint<DI,DJ,DK>...>;
+                 using Hull = std::tuple<StencilPoint<DI,DJ,DK>...>;
+    UnusedLeafIndex
+               Sentinel written by decodeInverseMaps for padding slots.
+               Defaults to ~uint32_t(0) (VoxelBlockManagerBase::UnusedLeafIndex).
+
+    Usage
+    -----
+    Construct once per VBM block; call moveTo() for each SIMD batch.
+    See StencilAccessor.md §10 for the caller pattern.
+*/
+
+#pragma once
+
+#include
+#include
+#include "BatchAccessor.h"
+
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+#include <type_traits>
+#include <utility>   // std::index_sequence, std::make_index_sequence
+
+namespace nanovdb {
+
+// =============================================================================
+// StencilPoint — compile-time stencil tap offset
+// =============================================================================
+
+/// Compile-time 3D offset used as a type (not a value) in StencilT::Taps
+/// and StencilT::Hull tuples.
+template<int DI, int DJ, int DK>
+struct StencilPoint {
+    static constexpr int di = DI;
+    static constexpr int dj = DJ;
+    static constexpr int dk = DK;
+};
+
+// =============================================================================
+// findIndex — compile-time inverse map: (DI,DJ,DK) → slot index in a Taps tuple
+// =============================================================================
+
+namespace detail {
+
+/// Returns the first index Is in [0,N) where tuple_element_t<Is, TapsT>
+/// matches (DI,DJ,DK), or -1 if not found.
+template<int DI, int DJ, int DK, typename TapsT, std::size_t... Is>
+constexpr int findIndex(std::index_sequence<Is...>)
+{
+    int result = -1;
+    // Fold: for each Is, if the tap matches and we haven't found one yet, record it.
+    ((std::tuple_element_t<Is, TapsT>::di == DI &&
+      std::tuple_element_t<Is, TapsT>::dj == DJ &&
+      std::tuple_element_t<Is, TapsT>::dk == DK &&
+      result < 0
+          ? (result = int(Is)) : 0), ...);
+    return result;
+}
+
+} // namespace detail
+
+// =============================================================================
+// Weno5Stencil — 18-tap axis-aligned WENO5 stencil, radius 3
+// =============================================================================
+
+/// Concrete StencilT for the WENO5 3D stencil.
+/// Taps: 18 axis-aligned offsets in {±1,±2,±3} × {x,y,z}.
+/// Hull: 6 extremal offsets that cover all 18 tap crossing directions.
+struct Weno5Stencil {
+    using Taps = std::tuple<
+        // x-axis
+        StencilPoint<-3, 0, 0>, StencilPoint<-2, 0, 0>, StencilPoint<-1, 0, 0>,
+        StencilPoint<+1, 0, 0>, StencilPoint<+2, 0, 0>, StencilPoint<+3, 0, 0>,
+        // y-axis
+        StencilPoint< 0,-3, 0>, StencilPoint< 0,-2, 0>, StencilPoint< 0,-1, 0>,
+        StencilPoint< 0,+1, 0>, StencilPoint< 0,+2, 0>, StencilPoint< 0,+3, 0>,
+        // z-axis
+        StencilPoint< 0, 0,-3>, StencilPoint< 0, 0,-2>, StencilPoint< 0, 0,-1>,
+        StencilPoint< 0, 0,+1>, StencilPoint< 0, 0,+2>, StencilPoint< 0, 0,+3>
+    >;
+    // Hull = 6 extremal taps that collectively probe all reachable face-neighbor
+    // directions for any combination of voxel position and WENO5 tap.
+    // See StencilAccessor.md §4b for the monotonicity argument.
+    using Hull = std::tuple<
+        StencilPoint<-3, 0, 0>, StencilPoint<+3, 0, 0>,
+        StencilPoint< 0,-3, 0>, StencilPoint< 0,+3, 0>,
+        StencilPoint< 0, 0,-3>, StencilPoint< 0, 0,+3>
+    >;
+};
+
+// =============================================================================
+// StencilAccessor
+// =============================================================================
+
+template<typename BuildT, int W, typename StencilT,
+         uint32_t UnusedLeafIndex = ~uint32_t(0)>
+class StencilAccessor
+{
+    using GridT = NanoGrid<BuildT>;
+
+    // -------------------------------------------------------------------------
+    // Type aliases — scalar/SIMD split (§5 of design doc)
+    // -------------------------------------------------------------------------
+
+    // Output index type: one Simd<uint64_t, W> per tap.
+    using IndexVec = std::conditional_t<W == 1, uint64_t, util::Simd<uint64_t, W>>;
+
+    // Voxel offset type: loaded from the voxelOffset[] array (uint16_t).
+    using OffsetVec = std::conditional_t<W == 1, uint16_t, util::Simd<uint16_t, W>>;
+
+    // Leaf index type: loaded from the leafIndex[] array (uint32_t).
+    using LeafIdVec = std::conditional_t<W == 1, uint32_t, util::Simd<uint32_t, W>>;
+
+    // Internal mask — derived from leafIndex[] comparisons (uint32_t domain).
+    // Passed to BatchAccessor::prefetch / cachedGetValue.
+    using LeafMaskVec = std::conditional_t<W == 1, bool, util::SimdMask<uint32_t, W>>;
+
+    // External mask — returned by moveTo; semantically over mIndices (uint64_t).
+    // Both LeafMaskVec and IndexMaskVec are W-bit masks; conversion is a
+    // boolean round-trip (see SimdMask converting constructor in Simd.h).
+    using IndexMaskVec = std::conditional_t<W == 1, bool, util::SimdMask<uint64_t, W>>;
+
+    // BatchAccessor parameterised with LeafMaskVec (prefetch/cachedGetValue domain).
+    using BatchAcc = std::conditional_t<W == 1, BatchAccessor<BuildT, uint16_t, bool>,
+                                        BatchAccessor<BuildT, OffsetVec, LeafMaskVec>>;
+
+    static constexpr int SIZE      = int(std::tuple_size_v<typename StencilT::Taps>);
+    static constexpr int HULL_SIZE = int(std::tuple_size_v<typename StencilT::Hull>);
+
+public:
+    // -------------------------------------------------------------------------
+    // Construction
+    //
+    // firstLeafID  -- VBM block's starting leaf ID (vbm.hostFirstLeafID()[blockID]).
+    // nExtraLeaves -- number of distinct center-leaf advances possible in this block
+    //                 (computed by the caller from the jumpMap). Used only as a
+    //                 debug-mode assert bound; not needed for correctness.
+    //                 See StencilAccessor.md §7 for removal instructions.
+    // -------------------------------------------------------------------------
+    StencilAccessor(const GridT& grid, uint32_t firstLeafID, uint32_t nExtraLeaves)
+        : mBatch(grid, firstLeafID)
+#ifndef NDEBUG
+        , mNExtraLeaves(nExtraLeaves)
+#endif
+    {
+        (void)nExtraLeaves; // suppress unused-parameter warning in release builds
+    }
+
+    // -------------------------------------------------------------------------
+    // moveTo -- gather all tap indices for a W-wide batch of center voxels
+    //
+    // leafIndex   -- ptr to leafIndex[batchStart] (uint32_t array from decodeInverseMaps)
+    // voxelOffset -- ptr to voxelOffset[batchStart] (uint16_t array from decodeInverseMaps)
+    //
+    // Returns the initial active-lane mask (leafSlice != UnusedLeafIndex), widened
+    // to IndexMaskVec. Active lanes have valid results in mIndices[0..SIZE-1].
+    // Inactive lanes hold 0 (NanoVDB background index).
+    //
+    // See StencilAccessor.md §8 for the full straddling loop design.
+    // -------------------------------------------------------------------------
+    IndexMaskVec moveTo(const uint32_t* leafIndex, const uint16_t* voxelOffset)
+    {
+        // Zero all tap slots — inactive lanes will hold 0 (background index).
+        zeroIndices(std::make_index_sequence<SIZE>{});
+
+        // Load this batch.
+        const LeafIdVec leafSlice = loadLeafIdVec(leafIndex);
+        const OffsetVec voVec = loadOffsetVec(voxelOffset);
+
+        // Initial active-lane mask (which lanes have real voxels).
+        LeafMaskVec activeMask = (leafSlice != LeafIdVec(UnusedLeafIndex));
+
+        // Save before the drain loop — this is what we return.
+        const IndexMaskVec resultMask = widenMask(activeMask);
+
+        if (util::none_of(activeMask)) return resultMask;
+
+#ifndef NDEBUG
+        uint32_t nAdvances = 0;
+#endif
+
+        // Straddling loop: consume one center leaf's worth of lanes per iteration.
+        while (util::any_of(activeMask)) {
+            const LeafMaskVec leafMask =
+                activeMask & (leafSlice == LeafIdVec(mBatch.centerLeafID()));
+
+            if (util::none_of(leafMask)) {
+                // No lanes for this leaf — advance to next.
+                mBatch.advance(mBatch.centerLeafID() + 1);
+#ifndef NDEBUG
+                assert(++nAdvances <= mNExtraLeaves);
+#endif
+                continue;
+            }
+
+            // Prefetch hull — warms all neighbor-leaf directions the full
+            // stencil can reach, before any cachedGetValue is called.
+            prefetchHull(voVec, leafMask, std::make_index_sequence<HULL_SIZE>{});
+
+            // Compute all tap indices and blend into mIndices.
+            calcTaps(voVec, leafMask, std::make_index_sequence<SIZE>{});
+
+            // Remove processed lanes.
+            activeMask = activeMask & !leafMask;
+        }
+
+        return resultMask;
+    }
+
+    // -------------------------------------------------------------------------
+    // getValue -- access tap result by compile-time coordinate
+    //
+    // Resolved entirely at compile time via the findIndex constexpr fold.
+    // Returns a const reference valid until the next moveTo() call.
+    // -------------------------------------------------------------------------
+    template<int DI, int DJ, int DK>
+    const IndexVec& getValue() const
+    {
+        constexpr int I = detail::findIndex<DI, DJ, DK, typename StencilT::Taps>(
+            std::make_index_sequence<SIZE>{});
+        static_assert(I >= 0, "StencilAccessor::getValue: tap not in stencil");
+        return mIndices[I];
+    }
+
+    // -------------------------------------------------------------------------
+    // operator[] -- indexed tap access (for generic iteration over all taps)
+    //
+    // No bounds check in release. Same lifetime as getValue.
+    // -------------------------------------------------------------------------
+    const IndexVec& operator[](int i) const { return mIndices[i]; }
+
+    static constexpr int size() { return SIZE; }
+
+private:
+    // -------------------------------------------------------------------------
+    // Private helpers
+    // -------------------------------------------------------------------------
+
+    // Compile-time zero of all SIZE index slots.
+    template<std::size_t... Is>
+    void zeroIndices(std::index_sequence<Is...>)
+    {
+        ((mIndices[Is] = IndexVec(uint64_t(0))), ...);
+    }
+
+    // Load LeafIdVec from a uint32_t pointer (scalar or SIMD).
+    static LeafIdVec loadLeafIdVec(const uint32_t* p)
+    {
+        if constexpr (W == 1) return *p;
+        else                  return LeafIdVec(p, util::element_aligned);
+    }
+
+    // Load OffsetVec from a uint16_t pointer (scalar or SIMD).
+    static OffsetVec loadOffsetVec(const uint16_t* p)
+    {
+        if constexpr (W == 1) return *p;
+        else                  return OffsetVec(p, util::element_aligned);
+    }
+
+    // Widen LeafMaskVec (uint32_t domain) → IndexMaskVec (uint64_t domain).
+    // Both are W-bit masks; SimdMask<uint64_t, W> has a converting constructor from
+    // SimdMask<uint32_t, W> that copies the bool array element-by-element (Simd.h §B).
+    // The stdx backend uses a boolean round-trip (WhereExpression, Simd.h §A).
+    static IndexMaskVec widenMask(LeafMaskVec m)
+    {
+        if constexpr (W == 1) return m;
+        else                  return IndexMaskVec(m);
+    }
+
+    // Compile-time fold: prefetch all HULL_SIZE hull directions.
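+    // (For Weno5Stencil this fold expands to six prefetch calls, one per
+    //  extremal hull tap listed above; see StencilAccessor.md §4b.)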
+    template<std::size_t... Is>
+    void prefetchHull(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence<Is...>)
+    {
+        using Hull = typename StencilT::Hull;
+        (mBatch.template prefetch<
+            std::tuple_element_t<Is, Hull>::di,
+            std::tuple_element_t<Is, Hull>::dj,
+            std::tuple_element_t<Is, Hull>::dk
+        >(voVec, leafMask), ...);
+    }
+
+    // Compile-time fold: cachedGetValue for all SIZE taps, where-blend into mIndices.
+    template<std::size_t... Is>
+    void calcTaps(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence<Is...>)
+    {
+        (blendOneTap<Is>(voVec, leafMask), ...);
+    }
+
+    // Fetch one tap and blend its result into mIndices[I] for the active lanes.
+    // The where(leafMask, mIndices[I]) = tmp blend uses the heterogeneous
+    // where() overload from Simd.h: LeafMaskVec (uint32_t) applied to
+    // IndexVec (uint64_t). Both are W-bit masks; Simd.h handles the conversion.
+    template<std::size_t I>
+    void blendOneTap(OffsetVec voVec, LeafMaskVec leafMask)
+    {
+        using P = std::tuple_element_t<I, typename StencilT::Taps>;
+        IndexVec tmp(uint64_t(0));
+        mBatch.template cachedGetValue<P::di, P::dj, P::dk>(tmp, voVec, leafMask);
+        util::where(leafMask, mIndices[I]) = tmp;
+    }
+
+    // -------------------------------------------------------------------------
+    // Members
+    // -------------------------------------------------------------------------
+
+    BatchAcc mBatch;         // owns neighbor-leaf cache, mCenterLeafID
+    IndexVec mIndices[SIZE]; // one vector per tap — output store
+
+#ifndef NDEBUG
+    uint32_t mNExtraLeaves;  // removable sanity bound on center-leaf advances
+#endif
+};
+
+} // namespace nanovdb
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md
new file mode 100644
index 0000000000..2f6805026b
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md
@@ -0,0 +1,506 @@
+# StencilAccessor — Design Plan
+
+Higher-level wrapper around `BatchAccessor` that owns the straddling loop,
+fills complete stencil result arrays, and presents a clean per-block API to
+the WENO (or other stencil) kernel.
+
+---
+
+## 1. Purpose
+
+`StencilAccessor` wraps `BatchAccessor` and owns the full stencil evaluation
+for one SIMD-wide batch of voxels. Its output is a fixed-size array of
+`Simd<uint64_t, W>` — one vector per tap — containing the ValueOnIndex indices
+for all W lanes simultaneously. The caller uses these indices to fetch sidecar
+data (floats, etc.) independently; no value arrays are read here.
+
+```
+input:  W voxel offsets + W-wide active mask + center-leaf context
+output: Simd<uint64_t, W> × N_taps
+```
+
+This separates index gathering (StencilAccessor) from value fetching (caller),
+which is the right split: index gathering is the expensive irregular part;
+value fetching is a straight gather from a dense sidecar array that the caller
+can pipeline, prefetch, or vectorise independently.
+
+---
+
+## 2. Relationship to BatchAccessor
+
+`BatchAccessor::cachedGetValue` produces one `Simd<uint64_t, W>` for
+one tap. `StencilAccessor` calls it for every tap in the stencil and assembles
+the result array. It also owns:
+
+- calling `prefetch` for every direction that the batch may cross into
+- the **straddling loop**: the `while (any_of(leafMask))` structure that handles
+  lanes whose center leaf differs from the majority and must be processed
+  separately before rejoining the batch
+
+---
+
+## 3. Why StencilAccessor must own the stencil — the cache invariant
+
+`BatchAccessor`'s neighbor cache (`mNeighborLeafIDs[27]`, `mProbedMask`) is valid
+only for the current center leaf. Advancing the center leaf invalidates the cache.
This creates a hard ordering constraint: **all taps must be computed for a given
+center leaf before the center leaf advances.**
+
+If the caller drove the tap loop and called `cachedGetValue` one tap at
+a time, it could inadvertently interleave taps across a center-leaf transition,
+producing silently wrong results (stale neighbor IDs).
+
+`StencilAccessor` avoids this by:
+1. Holding the complete tap list at compile time.
+2. Owning the center-leaf advancement loop.
+3. For each center leaf: calling `prefetch` for all needed directions, then
+   `cachedGetValue` for all taps, before advancing.
+
+The straddling case makes this constraint sharper: when some lanes cross into a new
+center leaf mid-batch, `StencilAccessor` peels those lanes off, runs the **full
+stencil** for the new center leaf on the peeled subset, then recombines — all before
+yielding the complete result array to the caller. This is only possible because
+the full tap list is known upfront.
+
+---
+
+## 4. Compile-time stencil description — `StencilT`
+
+The stencil is encoded in a `StencilT` policy class passed as a template argument
+to `StencilAccessor`. It carries two compile-time sets:
+
+### 4a. Tap set
+
+An ordered, sized list of `(di, dj, dk)` offsets. `SIZE` determines the number
+of output `Simd<uint64_t, W>` vectors; the index of each tap in the list is its
+slot in the output array, so the caller knows which slot corresponds to which offset.
+
+### 4b. Prefetch hull
+
+A list of **actual tap offsets** — not normalized `{-1,0,1}³` leaf directions —
+that `StencilAccessor` calls `prefetch` on before evaluating any tap.
+
+The hull is the **minimal set of extreme taps** such that prefetching them
+guarantees every `cachedGetValue` call for every stencil tap will find its
+neighbor leaf already cached.
+
+**Why extreme taps suffice — the monotonicity argument:**
+
+`prefetch` computes, for each lane, which neighbor-leaf direction it
+crosses into (encoded as the carry triple `(cx,cy,cz) ∈ {under,in,over}³` from
+the SWAR expansion). A crossing in the −x direction occurs when `x + di < 0`,
+i.e., when `x < |di|`. For a more extreme tap `hi` with `|hi| ≥ |di|` and the
+same sign, `x < |di| ⟹ x < |hi|` — so any lane that the intermediate tap would
+cause to cross is **also** detected by the extreme tap. The converse is not true
+(the extreme tap may probe a neighbor that the intermediate tap would not reach),
+but that is safe: a conservative probe wastes at most one `probeLeaf` call with
+no correctness impact.
+
+**WENO5 (axis-aligned taps, radius 3):**
+Lanes can never simultaneously cross two axis boundaries for a single tap, so
+edge and corner leaf neighbors are unreachable. The 6 axis-extremal taps are
+sufficient:
+
+```
+hull = { {-3,0,0}, {3,0,0},
+         {0,-3,0}, {0,3,0},
+         {0,0,-3}, {0,0,3} }
+```
+
+**3×3×3 box stencil (includes diagonal taps):**
+A lane at `(x=0, y=0, z=0)` with tap `(-1,-1,-1)` crosses all three axes at
+once, reaching the `(-1,-1,-1)` corner leaf neighbor. The 8 corner taps
+`{(±1,±1,±1)}` form the hull: each corner tap, across all lane positions,
+generates crossings in every combination of axes within its sign octant,
+covering all 26 neighbor directions (faces, edges, and corners).
+
+**General rule:** the hull = the **sign-octant convex hull vertices** of the
+tap set. For axis-aligned stencils these are the axis extremes; for stencils
+with diagonal taps these are the corners of the tap set's bounding box in each
+octant; an illustrative compile-time spot-check follows below.
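+
+Illustrative spot-check of the covering rule (not part of the header; assumes
+8-voxel leaves and a single axis):
+
+```cpp
+// True iff every local coordinate x at which tap `di` crosses a leaf boundary
+// is also a crossing for the extremal hull tap `hi` (same sign, |hi| >= |di|).
+constexpr bool hullCovers(int di, int hi) {
+    for (int x = 0; x < 8; ++x) {
+        const bool tapCrosses  = (x + di < 0) || (x + di > 7);
+        const bool hullCrosses = (x + hi < 0) || (x + hi > 7);
+        if (tapCrosses && !hullCrosses) return false;
+    }
+    return true;
+}
+static_assert(hullCovers(-1, -3) && hullCovers(-2, -3), "lo taps covered by {-3,0,0}");
+static_assert(hullCovers(+1, +3) && hullCovers(+2, +3), "hi taps covered by {+3,0,0}");
+```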
+
+The hull is **provided explicitly** rather than derived automatically — it is a
+one-time design-time decision per stencil type, and it avoids compile-time logic
+that would need to reason about leaf size vs. tap radius.
+
+### 4c. Sketch of `StencilT` concept
+
+```cpp
+// WENO5 3D stencil: 18 axis-aligned taps, radius 3, hull = 6 extremal taps
+struct Weno5Stencil {
+    static constexpr int SIZE = 18;
+
+    // ordered tap list: output slot i ↔ taps[i]
+    static constexpr nanovdb::Coord taps[SIZE] = {
+        {-3,0,0}, {-2,0,0}, {-1,0,0}, {1,0,0}, {2,0,0}, {3,0,0},
+        {0,-3,0}, {0,-2,0}, {0,-1,0}, {0,1,0}, {0,2,0}, {0,3,0},
+        {0,0,-3}, {0,0,-2}, {0,0,-1}, {0,0,1}, {0,0,2}, {0,0,3},
+    };
+
+    // prefetch hull: 6 extremal taps cover all 18
+    static constexpr int HULL_SIZE = 6;
+    static constexpr nanovdb::Coord hull[HULL_SIZE] = {
+        {-3,0,0}, {3,0,0},
+        {0,-3,0}, {0,3,0},
+        {0,0,-3}, {0,0,3},
+    };
+};
+```
+
+The exact representation (constexpr arrays, parameter packs, index sequences) is
+to be refined. The conceptual contract is fixed: `StencilT` exposes `SIZE`,
+an indexed tap list, `HULL_SIZE`, and an indexed hull list — all at compile time.
+
+---
+
+## 5. Template parameters and type aliases
+
+```cpp
+template <typename GridT, typename StencilT, int W>
+class StencilAccessor {
+
+    // Scalar/SIMD split — explicit conditional, not Simd<T, 1> degeneracy.
+    // Matches the convention BatchAccessor already uses for its own template params.
+    using IndexVec  = std::conditional_t<W == 1, uint64_t, Simd<uint64_t, W>>;
+    using OffsetVec = std::conditional_t<W == 1, uint16_t, Simd<uint16_t, W>>;
+    using LeafIdVec = std::conditional_t<W == 1, uint32_t, Simd<uint32_t, W>>;
+
+    // Two distinct mask types — they differ in element width and in role:
+    //
+    //   LeafMaskVec  — mask over leafIndex[] (uint32_t) comparisons.
+    //                  Used internally in the straddling loop and passed to
+    //                  BatchAccessor::prefetch / cachedGetValue.
+    //
+    //   IndexMaskVec — mask over mIndices[] (uint64_t) values.
+    //                  Returned by moveTo so the caller can gate reads from
+    //                  Simd<uint64_t, W> stencil result vectors.
+    //
+    // In the underlying bitmask representation both are W-bit masks; the type
+    // distinction exists for semantic correctness when blending or gating on
+    // 64-bit vs 32-bit SIMD data. A widening reinterpret is needed when
+    // converting the initial LeafMaskVec activeMask to the IndexMaskVec return.
+    using LeafMaskVec  = std::conditional_t<W == 1, bool, SimdMask<uint32_t, W>>;
+    using IndexMaskVec = std::conditional_t<W == 1, bool, SimdMask<uint64_t, W>>;
+
+    // BatchAccessor is parameterised with LeafMaskVec because prefetch() and
+    // cachedGetValue() operate in the leaf-ID (uint32_t) domain.
+    using BatchAcc = std::conditional_t<W == 1, BatchAccessor<GridT, bool>,
+                                        BatchAccessor<GridT, LeafMaskVec>>;
+
+    static constexpr int SIZE      = std::tuple_size_v<typename StencilT::Taps>;
+    static constexpr int HULL_SIZE = std::tuple_size_v<typename StencilT::Hull>;
+};
+```
+
+W=1 gives a fully scalar `BatchAccessor` underneath with plain scalar `mIndices` —
+a clean debug and cross-validation path identical in logic to the SIMD path.
+
+---
+
+## 6. Internal state
+
+```cpp
+BatchAcc mBatch;         // owns neighbor-leaf cache, mCenterLeafID, and cachedGetValue
+IndexVec mIndices[SIZE]; // one SIMD vector (or scalar) per tap — output store
+```
+
+**`mBatch`** — the embedded `BatchAccessor`. It is the **single source of truth**
+for the current center leaf ID. `BatchAccessor` exposes a `centerLeafID()` getter
+so `StencilAccessor::moveTo` can read it for the `leafSlice == currentLeafID`
+comparison without maintaining a redundant copy. `StencilAccessor` drives
+advancement by calling `mBatch.advance(newLeafID)`.
+ +`StencilAccessor` has **no separate `mCurrentLeafID` member** — having both +`mBatch.mCenterLeafID` and a local copy would be redundant state that can get +out of sync. + +**`mIndices`** — accumulation buffer filled by `moveTo`. At the **top of each +`moveTo` call**, all `SIZE` vectors are zeroed. Index 0 is the NanoVDB +IndexGrid "not found / background" sentinel, so inactive lanes (those not set +in the returned `IndexMaskVec`) yield a well-defined background index rather +than stale data. Active lanes are then written by the straddling loop via +`where`-blend; in the straddling case the blend ensures majority-leaf results +are not overwritten when minority-leaf lanes are processed. + +**Stack footprint:** for WENO5, W=16: 18 × 16 × 8 bytes = **2.25 KB**. +Acceptable for a stack-local object within a VBM block kernel; would need care +if embedded in a larger persistent structure. + +--- + +## 7. Construction and leaf-ID monotonicity + +```cpp +StencilAccessor(const GridT& grid, uint32_t firstLeafID, uint32_t nExtraLeaves) + : mBatch(grid, firstLeafID) +#ifndef NDEBUG + , mNExtraLeaves(nExtraLeaves) +#endif +{} +``` + +Constructed once per VBM block. `firstLeafID = vbmHandle.hostFirstLeafID()[blockID]` +is the correct starting center leaf — the VBM block begins there by definition. + +`nExtraLeaves` is the number of distinct center-leaf advances the straddling loop +may make across the entire block (computed from the jumpMap by the caller). It is +used only as a debug-mode assert bound; it is not needed for correctness. Once the +implementation is vetted, remove the `#ifndef NDEBUG` member, the assert in `moveTo`, +and the constructor parameter — four targeted deletions with no restructuring. + +**Leaf-ID monotonicity invariant:** The VBM assigns leaf IDs in Morton order. +Within a block, `leafIndex[0..BlockWidth-1]` is **non-decreasing**: as the voxel +index advances, the leaf IDs can only stay the same or increase — never decrease. + +This invariant is load-bearing for the straddling loop: + +- `advance(centerLeafID() + 1)` is always correct: once all lanes for leaf N are + consumed from the current batch, no future batch will ever contain a lane for + leaf N. A simple increment is sufficient; no backward search is needed. +- The `while (any_of(activeMask))` loop is guaranteed to terminate: each iteration + either removes lanes from `activeMask` (progress toward `none_of`) or increments + the center leaf (progress toward the end of the block). At most `nLeaves` + center-leaf advances occur per batch; typically zero or one. +- The `BatchAccessor` neighbor cache is never invalidated "in reverse" — its + monotonic advance matches the monotonic leaf-ID layout. + +The instance persists for the entire block (across all `moveTo` calls) and is +destroyed when the block loop advances to the next block. + +--- + +## 8. `moveTo` — signature and body + +### 8a. Signature + +```cpp +IndexMaskVec moveTo(const uint32_t* leafIndex, // ptr to leafIndex[batchStart] + const uint16_t* voxelOffset); // ptr to voxelOffset[batchStart] +``` + +Takes raw pointers into the block's decoded inverse-map arrays at the current +batch offset. Returns the **initial** active-lane mask — `(leafSlice != +UnusedLeafIndex)` computed before the straddling loop — converted from +`LeafMaskVec` (uint32_t domain) to `IndexMaskVec` (uint64_t domain). + +The returned mask has two simultaneous readings: +- **Validity**: lane `i` held a real voxel (not a padding sentinel). 
+- **Usability**: `mIndices[k][i]` contains a valid stencil index for lane `i`.
+
+They are the same predicate because active lanes are written by `cachedGetValue`
+and inactive lanes hold 0 (zeroed at the top of `moveTo`). The straddling loop
+drains `activeMask` to zero internally; the initial mask is saved separately and
+returned so the caller always receives a meaningful result.
+
+### 8b. Straddling loop body
+
+Mirrors the `while (any_of(activeMask))` loop from
+`ex_stencil_gather_cpu/stencil_gather_cpu.cpp` (lines 698–789):
+
+```
+moveTo(leafIndex*, voxelOffset*):
+
+    // Zero all tap slots — inactive lanes will hold index 0 (NanoVDB background).
+    for I in [0, SIZE): mIndices[I] = IndexVec(0)
+
+    leafSlice  ← load W values from leafIndex (LeafIdVec)
+    voVec      ← load W values from voxelOffset (OffsetVec)
+    activeMask ← (leafSlice != UnusedLeafIndex) as LeafMaskVec
+
+    // Save initial mask before the drain loop; this is what we return.
+    resultMask ← widen(activeMask) as IndexMaskVec
+
+    if none_of(activeMask): return resultMask   // entire batch inactive
+
+    // Debug-only advance counter — see §7 for removal instructions.
+    #ifndef NDEBUG
+    uint32_t nAdvances = 0
+    #endif
+
+    while any_of(activeMask):
+
+        leafMask ← activeMask & (leafSlice == LeafIdVec(mBatch.centerLeafID()))
+
+        if none_of(leafMask):
+            // No lanes for this leaf — advance to next, assert bound.
+            mBatch.advance(mBatch.centerLeafID() + 1)
+            NANOVDB_ASSERT(++nAdvances <= mNExtraLeaves)
+            continue
+
+        // Prefetch hull (compile-time fold over StencilT::Hull)
+        for each HullPoint H in StencilT::Hull:
+            mBatch.prefetch<H.di, H.dj, H.dk>(voVec, leafMask)
+
+        // Compute all taps and blend into mIndices
+        for each tap I in [0, SIZE):
+            using P = tuple_element_t<I, StencilT::Taps>
+            tmp ← IndexVec(0)
+            mBatch.cachedGetValue<P::di, P::dj, P::dk>(tmp, voVec, leafMask)
+            where(leafMask, mIndices[I]) = tmp   // blend: preserve other lanes
+
+        activeMask &= !leafMask   // remove processed lanes
+
+    return resultMask
+```
+
+The `where`-blend is essential for correctness in straddling batches: lanes
+belonging to a second center leaf must not overwrite results already written
+for the first center leaf in the same `mIndices` slot.
+
+Note: `leafMask` is `LeafMaskVec` (uint32_t domain) while `mIndices[I]` is
+`IndexVec` (uint64_t). The `where`-blend requires either a widening cast of
+`leafMask` to `IndexMaskVec`, or a `where` overload in Simd.h that accepts
+cross-width masks. Since both are W-bit masks, this is a bitmask reinterpret
+with no data movement.
+
+### 8c. Hull and tap loops as compile-time folds
+
+Both loops expand to zero-overhead compile-time instantiations:
+
+```cpp
+// Hull prefetch fold
+[this, &voVec, &leafMask](std::index_sequence<Is...>) {
+    using Hull = typename StencilT::Hull;
+    (mBatch.prefetch<
+        std::tuple_element_t<Is, Hull>::di,
+        std::tuple_element_t<Is, Hull>::dj,
+        std::tuple_element_t<Is, Hull>::dk
+    >(voVec, leafMask), ...);
+}(std::make_index_sequence<HULL_SIZE>{});
+
+// Tap cachedGetValue fold
+[this, &voVec, &leafMask](std::index_sequence<Is...>) {
+    using Taps = typename StencilT::Taps;
+    (blendOneTap<Is>(voVec, leafMask), ...);
+}(std::make_index_sequence<SIZE>{});
+```
+
+where `blendOneTap<I>` calls `cachedGetValue<P::di, P::dj, P::dk>` into a
+temporary and then `where`-blends into `mIndices[I]`.
+
+---
+
+## 9. `getValue()` — tap access by coordinate
+
+```cpp
+template <int di, int dj, int dk>
+const IndexVec& getValue() const {
+    constexpr int I = findIndex<di, dj, dk>(
+        std::make_index_sequence<SIZE>{});
+    static_assert(I >= 0, "StencilAccessor::getValue: tap not in stencil");
+    return mIndices[I];
+}
+```
+
+**Inverse map** (`findIndex`): a `constexpr` fold over all `SIZE` taps, comparing
+`(di,dj,dk)` against each `StencilPoint`. O(N) compile-time evaluations —
+negligible for realistic stencil sizes. Resolved entirely at compile time; the
+resulting `I` is a compile-time constant used as an array index.
+
+**`static_assert`**: catches invalid tap coordinates at compile time with a clear
+message. Same safety guarantee as OpenVDB stencil's bounds check.
+
+**Lifetime**: the returned reference is valid only until the next `moveTo` call.
+The caller must not cache the reference across batches.
+
+**Indexed access** — for kernels that iterate over all taps generically:
+
+```cpp
+const IndexVec& operator[](int i) const { return mIndices[i]; }
+```
+
+Public, no bounds check in release. Same lifetime caveat as `getValue`.
+
+---
+
+## 10. Caller-side usage pattern
+
+```cpp
+// Construct once per VBM block
+StencilAccessor<GridT, Weno5Stencil, 16> stencil(grid, vbm.firstLeafID(blockID), nExtraLeaves);
+
+for (int b = 0; b < nBatches; ++b) {
+    auto active = stencil.moveTo(leafIndex + b*W, voxelOffset + b*W);
+    if (util::none_of(active)) continue;
+
+    // Access by coordinate — compile-time slot resolution
+    auto idx_m3 = stencil.getValue<-3, 0, 0>();  // Simd<uint64_t, 16>
+    auto idx_m2 = stencil.getValue<-2, 0, 0>();
+    // ... feed into WENO kernel alongside sidecar value fetches
+}
+// stencil destroyed here (end of block scope)
+```
+
+---
+
+## 11. Ownership summary
+
+| Concern | Owner |
+|---------|-------|
+| Neighbor-leaf cache (`mNeighborLeafIDs[27]`, `mProbedMask`) | `BatchAccessor` |
+| Cache population | `BatchAccessor::prefetch` (called by `StencilAccessor`) |
+| Cache invalidation | `BatchAccessor` constructor + `advance()` — both clear `mProbedMask` and set `mCenterLeafID`; neither rebuilds the cache |
+| `cachedGetValue` | `BatchAccessor` (called by `StencilAccessor`) |
+| `advance(newLeafID)` | `BatchAccessor` — this is the only legitimate setter for `mCenterLeafID`; no raw setter exists (would bypass cache invalidation) |
+| `mCenterLeafID` read access | `BatchAccessor::centerLeafID()` getter — exposed to `StencilAccessor`; no external setter |
+| `leafMask` computation | `StencilAccessor` (derived inside `moveTo`) |
+| Straddling loop | `StencilAccessor` |
+| Hull prefetch sequencing | `StencilAccessor` |
+| Tap fold and `where`-blend | `StencilAccessor` |
+| `mIndices[SIZE]` storage and zeroing | `StencilAccessor` (zeroed at top of each `moveTo`) |
+| `nExtraLeaves` debug bound | `StencilAccessor` (`#ifndef NDEBUG` member; removable) |
+| Center-leaf lifetime (block scope) | Caller |
+
+---
+
+## 12. Design decisions (all resolved)
+
+1. **`moveTo` return type — `IndexMaskVec` by value.**
+   The initial `activeMask = (leafSlice != UnusedLeafIndex)` is saved before the
+   straddling loop drains it to zero, widened from `LeafMaskVec` (uint32_t) to
+   `IndexMaskVec` (uint64_t), and returned. This gives the caller a mask that is
+   semantically aligned with the uint64_t `mIndices` data. The returned mask has
+   two simultaneous readings: which lanes held valid voxels (not padding sentinels),
+   and which lanes of `mIndices[k]` contain valid stencil indices. These are the
+   same predicate. No member copy is kept — the mask is consumed at the call site.
+
+2. **Inactive-lane `mIndices` values — zeroed at top of `moveTo`.**
+   `mIndices[0..SIZE-1]` is set to `IndexVec(0)` at the start of every `moveTo`
+   call. Index 0 is the NanoVDB IndexGrid "not found / background" sentinel, so
+   inactive lanes yield a well-defined background index rather than stale data.
+   The cost is `SIZE` × W zero-writes per call (~36 YMM stores for WENO5 W=16),
+   which is negligible.
+
+3. **`operator[]` — public, const-ref, no bounds check.**
+   ```cpp
+   const IndexVec& operator[](int i) const { return mIndices[i]; }
+   ```
+   For kernels that iterate over all taps generically. Same lifetime as
+   `getValue`: valid only until the next `moveTo` call.
+
+4. **`StencilT` representation — `std::tuple<StencilPoint<di,dj,dk>...>` for both
+   `Taps` and `Hull`.**
+   The compile-time fold in §8c requires `std::tuple_element_t<I, Taps>::di` to
+   be a compile-time constant. This is clean with a tuple-of-types but not with a
+   constexpr array indexed by a template parameter. `std::tuple_size_v` and
+   `std::tuple_element_t` are the sole introspection mechanisms needed.
+
+5. **`BatchAccessor::centerLeafID()` getter — add; no raw setter.**
+   ```cpp
+   uint32_t centerLeafID() const { return mCenterLeafID; }
+   ```
+   The only change required in `BatchAccessor`. No raw setter: `advance()` is
+   the sole legitimate state transition for `mCenterLeafID`. Both the constructor
+   and `advance()` only **invalidate** the cache (clear `mProbedMask`); they do
+   not rebuild it. Cache population is entirely the caller's responsibility via
+   `prefetch()`, called by `StencilAccessor` inside the straddling loop.
+
+6. **`nExtraLeaves` — kept as a removable debug sanity check.**
+   Passed to the constructor, stored as `#ifndef NDEBUG uint32_t mNExtraLeaves`
+   member, asserted against a local `nAdvances` counter on each `advance()` call
+   inside `moveTo`. Termination is guaranteed by the VBM monotonicity invariant
+   (§7) without this bound; the bound is belt-and-suspenders only. To remove once
+   vetted: delete the `#ifndef NDEBUG` member block, the assert line, and the
+   `nExtraLeaves` constructor parameter — four targeted deletions.
diff --git a/simd_test/Simd.h b/simd_test/Simd.h
index c6e8bdc68b..cf89d36f9d 100644
--- a/simd_test/Simd.h
+++ b/simd_test/Simd.h
@@ -171,7 +171,7 @@ template <typename T, int W>
 struct SimdMask {
     std::array<bool, W> data{};
     SimdMask() = default;
-    NANOVDB_SIMD_HOSTDEV explicit SimdMask(const bool* p, element_aligned_tag = {}) {
+    NANOVDB_SIMD_HOSTDEV explicit SimdMask(const bool* p, element_aligned_tag) {
         for (int i = 0; i < W; i++) data[i] = p[i];
     }
     // Converting constructor: copy bool values from a mask over a different element type.
@@ -200,7 +200,7 @@ struct Simd {
 
     Simd() = default;
     NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); } // broadcast
-    NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p, element_aligned_tag = {}) { // load
+    NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p, element_aligned_tag) { // load
         for (int i = 0; i < W; i++) data[i] = p[i];
     }
     NANOVDB_SIMD_HOSTDEV T operator[](int i) const { return data[i]; }

From c605ec2f1b8fb26468c54b97887dd039081102bc Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sat, 18 Apr 2026 12:11:42 -0500
Subject: [PATCH 27/60] refactor: move Simd.h, BatchAccessor.h,
 StencilAccessor.h to nanovdb/util/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All three headers are now shared dependencies across multiple examples
and are no longer specific to a single example directory.
Consolidate into nanovdb/nanovdb/util/ to match the existing NanoVDB
utility layout.

- simd_test/Simd.h → nanovdb/util/Simd.h
- ex_voxelBlockManager_host_cuda/BatchAccessor.h → nanovdb/util/BatchAccessor.h
- ex_voxelBlockManager_host_cuda/StencilAccessor.h → nanovdb/util/StencilAccessor.h

Update all #include paths in BatchAccessor.h, StencilAccessor.h,
stencil_gather_cpu.cpp, and simd_test scratchpad files.

Also add HaloStencilAccessor.md: design document for the dense halo
buffer approach to CPU WENO5 stencil extraction (successor to
StencilAccessor).

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Efty Sifakis
---
 .../stencil_gather_cpu.cpp                    |   6 +-
 .../HaloStencilAccessor.md                    | 337 ++++++++++++++++++
 .../BatchAccessor.h                           |   2 +-
 {simd_test => nanovdb/nanovdb/util}/Simd.h    |   0
 .../StencilAccessor.h                         |   4 +-
 simd_test/StencilKernel.h                     |   2 +-
 6 files changed, 344 insertions(+), 7 deletions(-)
 create mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/HaloStencilAccessor.md
 rename nanovdb/nanovdb/{examples/ex_voxelBlockManager_host_cuda => util}/BatchAccessor.h (99%)
 rename {simd_test => nanovdb/nanovdb/util}/Simd.h (100%)
 rename nanovdb/nanovdb/{examples/ex_voxelBlockManager_host_cuda => util}/StencilAccessor.h (99%)

diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index 1fda6c31c1..9d2644200c 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -34,9 +34,9 @@
 #include
 #include
 #include
-#include                                                 // SimdMask, Simd, any_of, none_of, to_bitmask
-#include "../ex_voxelBlockManager_host_cuda/BatchAccessor.h"   // BatchAccessor
-#include "../ex_voxelBlockManager_host_cuda/StencilAccessor.h" // StencilAccessor, Weno5Stencil
+#include <nanovdb/util/Simd.h>            // SimdMask, Simd, any_of, none_of, to_bitmask
+#include <nanovdb/util/BatchAccessor.h>   // BatchAccessor
+#include <nanovdb/util/StencilAccessor.h> // StencilAccessor, Weno5Stencil
 
 #include <x86intrin.h>                    // __rdtsc, __rdtscp, _mm_lfence
 
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/HaloStencilAccessor.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/HaloStencilAccessor.md
new file mode 100644
index 0000000000..2a99d4f15c
--- /dev/null
+++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/HaloStencilAccessor.md
@@ -0,0 +1,337 @@
+# HaloStencilAccessor — Design Document
+
+## §1 Motivation
+
+`StencilAccessor` (see `StencilAccessor.md`) was measured at **537 cycles/voxel**
+for the index-gathering phase alone (W=16, `Simd`, rdtsc on i9-285K).
+The root cause is structural, not a tuning issue:
+
+- 18 taps × `Simd` = 18 YMM registers needed simultaneously for
+  `mIndices[]`, but x86 has only 16 YMM registers available.
+- The compiler spills every tap slot to memory → every `mIndices[I]` write is a
+  store, every subsequent read is a load.
+- Additionally, the gather step (reading float values from the sidecar using the
+  computed indices) is a second pass of scattered 64-bit indexed loads.
+
+**HaloStencilAccessor** eliminates both problems by replacing the
+index-buffer-then-gather pattern with a local dense halo buffer that is filled
+once per center-leaf run, from which all stencil values are extracted via
+sequential, gather-free SIMD operations.
+ +--- + +## §2 Key idea: dense halo buffer + +For a given center leaf `L`, densify the sidecar float data for `L` and its +6 axis-aligned face-neighbor leaves into a contiguous local array: + +``` +float buf[16][16][16] +``` + +The center leaf occupies positions `[R..R+7]` in each dimension (where `R` is +the stencil radius cap, see §3). Any stencil tap at compile-time offset +`(di, dj, dk)` from center voxel `(i, j, k)` is then: + +```cpp +buf[R + i + di][R + j + dj][R + k + dk] +``` + +This is a branch-free, uniform address expression valid for **any** tap with +`|di|, |dj|, |dk| ≤ R`. No tree traversal, no ValueOnIndex arithmetic, no +leaf-pointer lookup occurs inside the stencil extraction loop. + +--- + +## §3 Why R=4 and 16³ + +For a center leaf of 8×8×8 voxels, supporting stencils up to radius R requires +`(8 + 2R)` voxels per dimension: + +| R | buffer side | buffer size | L1 resident? | +|---|-------------|-------------|--------------| +| 1 (box 3³) | 10³ | ~4 KB | yes | +| 3 (WENO5) | 14³ | ~11 KB | yes | +| **4 (cap)** | **16³** | **16 KB** | **yes** | +| 5 | 18³ | ~23 KB | yes | + +`R = 4` is chosen because: + +1. **16 KB fits comfortably in L1** (P-core L1d = 48 KB). The buffer stays + L1-resident throughout the processing of an entire center-leaf run. +2. **16 = 2³ × 2 → trivially simple addressing.** No `bi/ii` split; a flat + `[16][16][16]` array indexed by `R + i + di` (a 4-bit quantity, 0–15). +3. **Covers both WENO5 (R=3) and any future stencil up to R=4** without + redesign. +4. Powers-of-two strides (256, 16, 1) admit bit-shift addressing. + +**Axis-aligned stencils never access corner or edge neighbor slots.** For +WENO5, taps move in exactly one axis → only the 6 face-neighbor leaf regions +of the 16³ buffer are ever read. Corner/edge slots may be left zero-initialized +(background value) and are never consumed. + +### Buffer population + +Slots that are populated from the sidecar: + +| Region | Voxels | Source | +|--------|--------|--------| +| Center leaf `[R..R+7]³` | 8³ = 512 | sidecar[center leaf] | +| −x face slab `[0..R-1][R..R+7][R..R+7]` | R×8×8 = 256 | sidecar[−x neighbor] | +| +x face slab `[R+8..15][R..R+7][R..R+7]` | 256 | sidecar[+x neighbor] | +| −y, +y, −z, +z face slabs | 256 each | sidecar[respective neighbors] | + +**Total sidecar reads per center-leaf run: 512 + 6×256 = 2,048 floats = 8 KB.** +Corner and edge slots are zero-initialized once at buffer allocation. + +--- + +## §4 Run-based outer loop + +Within a VBM block (128 voxels), the `leafIndex[]` array produced by +`decodeInverseMaps` is sorted (the VBM is built in leaf-traversal order). +Voxels belonging to the same center leaf therefore form **contiguous runs**. + +A narrow-band block of 128 voxels sitting inside a single 8³ leaf spans at +most 1–3 distinct center leaves per block in practice. + +``` +for each VBM block: + scan leafIndex[0..127] for run boundaries + for each run (center leaf L, voxels i_start..i_end): + populate buf[16][16][16] from sidecar // 8 KB, amortized over run + process all voxels in [i_start..i_end] via the slice pipeline (§6) +``` + +The buffer fill and the 7 neighbor-leaf pointer lookups are amortized across +all voxels in the run. 
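+
+A minimal sketch of this outer loop, assuming `leafIndex[]` is the sorted,
+sentinel-padded array produced by `decodeInverseMaps` (`forEachRun`,
+`populateHalo`, and `processRun` are illustrative names, not existing API):
+
+```cpp
+#include <cstdint>
+
+// Scan one VBM block's sorted leafIndex[] for contiguous center-leaf runs,
+// calling populateHalo once per run and processRun over the run's voxels.
+template <class PopulateFn, class ProcessFn>
+void forEachRun(const uint32_t* leafIndex, int blockWidth,
+                uint32_t unusedLeafIndex,
+                PopulateFn populateHalo, ProcessFn processRun)
+{
+    int i = 0;
+    while (i < blockWidth && leafIndex[i] != unusedLeafIndex) {
+        const uint32_t leafID = leafIndex[i];
+        const int runStart = i;
+        while (i < blockWidth && leafIndex[i] == leafID) ++i; // sorted => contiguous
+        populateHalo(leafID);            // fill buf[16][16][16] from the sidecar (§3)
+        processRun(leafID, runStart, i); // slice pipeline over voxels [runStart, i)
+    }
+}
+```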
+ +--- + +## §5 Stencil extraction: the fill → transpose → compact → transpose pipeline + +### §5a Why array-of-stencils [32][512] is easy to fill from buf[] + +For tap `t = (di, dj, dk)`, the 512 values `stencil[t][v]` for all center-leaf +voxel positions `v = i*64 + j*8 + k` are produced by iterating over the 64 +z-rows (fixed `i`, `j`; `k` = 0..7): + +``` +for each z-row (i, j): + source = &buf[R+i+di][R+j+dj][R+dk] // 8 consecutive floats in buf + dest = &stencil[t][i*64 + j*8] // 8 consecutive floats in row t + store 8 floats (one YMM) +``` + +Both source (L1-resident `buf`) and destination (stencil row `t`) are accessed +sequentially — **zero gathers**. All 18 WENO5 taps fill sequentially; the 14 +padding slots (see §5b) are either zero-initialized once or left unused. + +### §5b Padding taps to 32 + +WENO5 has 18 taps. Padding to 32 (next power of 2) gives: + +- Each voxel row in `stencil[*][32]` is exactly **128 bytes = 2 cache lines**, + naturally aligned. +- Compaction of one row (§5d) is exactly **4 full YMM loads + 4 full YMM + stores** — no masking, no partial registers, no scalar tail. +- `stencil[v]` address = `base + (v << 7)`: a single shift, no multiply. +- 14 padding slots are never read during WENO5 arithmetic and cost nothing. + +### §5c Layout duality and the 8×8 in-register transpose + +Two layouts are needed at different pipeline stages: + +| Layout | Alias | Size | Best for | +|--------|-------|------|----------| +| `float[32][N]` | SoA | 18/32 contiguous values per tap | sequential fill from buf | +| `float[N][32]` | AoS | 32 contiguous values per voxel | WENO5 arithmetic, compaction | + +Converting between them: the standard **8×8 SIMD float block transpose**. + +Given 8 YMM registers (one row each = 8 floats), the 8×8 transpose applies a +fixed 24-instruction register-only shuffle network +(`vunpcklps`/`vunpckhps` → `vunpcklpd`/`vunpckhpd` → `vperm2f128`) +and stores 8 result YMM registers. All shuffles are register-to-register; +no intermediate memory is touched between the 8 loads and 8 stores. + +For an M×N matrix (both M and N multiples of 8): +- Number of 8×8 block transposes: (M/8) × (N/8) +- Uses exactly 16 YMM registers (8 input + 8 output) — exact fit + +### §5d Compaction: AoS[512][32] → AoS[N_active][32] + +Given `stencil[512][32]` (AoS, all leaf positions) and the `voxelOffset[]` +list of N_active active voxels (0–511, from `decodeInverseMaps`): + +``` +for v in 0..N_active: + ymm0..3 = load stencil[voxelOffset[v]][0..31] // 4 × YMM load + store compact[v][0..31] // 4 × YMM store +``` + +Total: N_active × 8 YMM operations. Rows are 128-byte aligned; no masking. + +### §5e Slicing: keep the working set in L1 + +`stencil[32][512]` = 64 KB — L2 resident. Instead, process 4 slices of 128 +voxels each: + +``` +stencil[32][128] = 32 × 128 × 4 = 16 KB ← L1 resident +``` + +With the 16³ `buf` (16 KB) and the stencil slice (16 KB), the total working +set is **32 KB**, well within L1 (48 KB). All fill and transpose phases stay +in L1 — no L2 traffic during data transformation. + +Slice boundaries: voxel positions 0–127, 128–255, 256–383, 384–511 within the +center leaf. Each slice is processed identically; active-voxel results are +accumulated across slices. + +--- + +## §6 Per-slice pipeline (full detail) + +For each of the 4 slices `[s*128 .. 
(s+1)*128 - 1]`: + +``` +Step 1 Fill stencil[32][128] + ── For each tap t in 0..17: + For each z-row (i,j) with voxels in this slice: + load 8 floats from buf (L1) → store to stencil[t][row] (L1) + ── Cost: 9 KB reads + 9 KB writes, all L1. + +Step 2 Transpose [32][128] → [128][32] + ── 64 × 8×8 in-register block transposes. + ── Cost: 16 KB L1 read + 16 KB L1 write; all shuffles register-only. + +Step 3 Compact [128][32] → [N_slice][32] + ── For each active voxel in this slice: 4 YMM loads + 4 YMM stores (L1). + ── N_slice ≤ 128. Cost: ≤ 16 KB L1 read + ≤ 16 KB L1 write. + +Step 4 Transpose [N_slice][32] → [32][N_slice] + ── ≤ 64 × 8×8 in-register block transposes. + ── Cost: ≤ 16 KB L1 read + write. + +Step 5 WENO5 arithmetic on stencil[32][N_slice] + ── For each of the 18 taps: sequential load of N_slice floats (L1). + ── ~700 FLOPs/voxel, vectorised over N_slice voxels in YMM batches. + ── Cost: ~700 × N_slice / 32 cycles (2 FMA units × 8 floats). + +Step 6 Write output + ── N_slice scalar (or YMM-masked) stores to output sidecar. +``` + +--- + +## §7 Performance analysis + +### Measured baseline (StencilAccessor, W=16) + +- **8,586 TSC ticks/batch** (16 voxels) → **537 cycles/voxel** +- Gather phase only; WENO5 arithmetic not yet included. +- Root cause: register spilling of 18 × `Simd`. + +### CPU parameters (i9-285K, TSC reference clock) + +| Resource | Throughput | +|----------|-----------| +| L1 read | 64 bytes/cycle (~237 GB/s at 3.7 GHz) | +| L1 write | 32 bytes/cycle (~118 GB/s) | +| L2 read | 32 bytes/cycle (~118 GB/s) | +| FMA peak | 32 FLOPs/cycle (2 units × 8 floats × 2 FLOPs) | + +### Estimated cost per slice (128 voxels, ~32 active) + +| Step | Data touched | Estimated cycles | +|------|-------------|-----------------| +| 1 — fill stencil[32][128] | 9 KB L1 write | ~288 | +| 2 — transpose [32][128]→[128][32] | 16 KB L1 r+w | ~512 | +| 3 — compact | 16 KB L1 r+w | ~512 | +| 4 — transpose compact output | ≤16 KB L1 r+w | ~256 | +| 5 — WENO5 arithmetic (32 active) | 18×32×4=2 KB L1 read | ~700 | +| **Total per slice** | | **~2,268** | + +Per voxel (32 active): ~71 cycles/voxel — including full WENO5 arithmetic. + +Four slices + buffer fill (~300 cycles amortised): ~9,372 cycles per 128-voxel +block → ~**73 cycles/voxel total**. + +### Comparison + +| Metric | StencilAccessor | HaloStencilAccessor (est.) | +|--------|-----------------|---------------------------| +| Gather phase only | 537 cycles/voxel | ~50 cycles/voxel | +| Gather + WENO5 | not measured | ~73 cycles/voxel | +| Dominant bottleneck | register spilling | L1 write bandwidth | +| Gathers in hot loop | yes (scattered 64-bit) | **none** | + +**Estimated speedup over StencilAccessor: 7–10× on the gather phase; +gather + WENO5 combined comes in under the cost of gathering alone in the +old design.** + +--- + +## §8 Future optimisations + +### §8a Fuse fill + first transpose (steps 1+2) + +Fill 8 tap rows × 8 voxels into YMM registers, immediately transpose the +8×8 block and store in AoS order. Eliminates one full pass over the 16 KB +stencil slice; saves ~512 cycles per slice. + +### §8b Fuse WENO5 arithmetic with step 4 + +Rather than materialising `stencil[32][N_slice]` (SoA), compute WENO5 +directly from `stencil[N_slice][32]` (AoS) by loading each voxel's 32-float +row and computing vertically. Eliminates step 4 entirely. Effective when +N_slice is small (dense run ≤ 32 voxels). + +### §8c Software pipelining across slices + +While WENO5 runs on slice `s`, fill the stencil buffer for slice `s+1`. 
+The two phases touch disjoint L1 regions; overlap is feasible. + +### §8d TBB parallel_for over blocks + +VBM blocks are independent (grid is read-only). Each thread owns its +block's 32 KB working set (buf + slice); no synchronisation required. +Expected 7–8× speedup across 8 P-cores. + +--- + +## §9 Design decisions summary + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| Stencil radius cap | R = 4 | 16³ = 16 KB, L1 resident; power-of-2 | +| Tap count padding | 18 → 32 | YMM-aligned compaction (4 full registers) | +| Dense or sparse fill | Dense (all 512 leaf positions) | Branchless; cheaper than compaction logic during fill | +| Slice size | 128 voxels (4 slices of 512) | buf(16 KB) + slice(16 KB) = 32 KB ≤ L1 | +| Transpose kernel | 8×8 in-register float block | 16 YMM registers, no memory between load/store | +| Compaction order | After first transpose | Driven by sorted voxelOffset[] from decodeInverseMaps | +| Outer loop | Run-based (by center leaf) | Amortises buffer fill over entire run | + +--- + +## §10 Open questions + +1. **Leaf-pointer resolution**: buffer fill still requires resolving 6 face-neighbor + leaf pointers via tree traversal. Should this reuse BatchAccessor's + neighbor-lookup machinery, or be a standalone 6-pointer lookup? + +2. **Missing neighbors**: if a face-neighbor leaf does not exist in the grid, + the corresponding slab should be zero-filled (background = 0 for + ValueOnIndex grids). Confirm zero-init strategy for absent neighbors. + +3. **Non-uniform active-voxel density**: some slices may have 0 active voxels + (entire slice inactive). Add a slice-skip predicate? + +4. **Output sidecar write-back**: the `voxelOffset` of each active voxel gives + its ValueOnIndex; use that to write the WENO5 result directly to the output + sidecar. Confirm index arithmetic. + +5. **Tap padding slots (18..31)**: never read in WENO5 arithmetic. Can be + left uninitialised (no UB since never read) or zero-filled once. Decide + at implementation time. 
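+
+---
+
+## §11 Appendix: 8×8 in-register block transpose (cf. §5c)
+
+A minimal AVX2 sketch of the 8×8 float block transpose described in §5c,
+written with the `_mm256_shuffle_ps` form of the middle stage (equivalent to
+the `vunpcklpd`/`vunpckhpd` pairing). Function name and staging are
+illustrative assumptions, not the shipped implementation:
+
+```cpp
+#include <immintrin.h>
+
+// Transpose an 8x8 float block held in 8 YMM registers.
+// r[i] holds row i on input and column i on output.
+// 24 register-only shuffles; no memory is touched between load and store.
+inline void transpose8x8(__m256 r[8])
+{
+    // Stage 1: interleave adjacent row pairs at 32-bit granularity.
+    __m256 t0 = _mm256_unpacklo_ps(r[0], r[1]), t1 = _mm256_unpackhi_ps(r[0], r[1]);
+    __m256 t2 = _mm256_unpacklo_ps(r[2], r[3]), t3 = _mm256_unpackhi_ps(r[2], r[3]);
+    __m256 t4 = _mm256_unpacklo_ps(r[4], r[5]), t5 = _mm256_unpackhi_ps(r[4], r[5]);
+    __m256 t6 = _mm256_unpacklo_ps(r[6], r[7]), t7 = _mm256_unpackhi_ps(r[6], r[7]);
+    // Stage 2: merge 64-bit pairs across row groups.
+    __m256 u0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0));
+    __m256 u1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2));
+    __m256 u2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0));
+    __m256 u3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2));
+    __m256 u4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1,0,1,0));
+    __m256 u5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3,2,3,2));
+    __m256 u6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1,0,1,0));
+    __m256 u7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3,2,3,2));
+    // Stage 3: exchange 128-bit lanes to complete the transpose.
+    r[0] = _mm256_permute2f128_ps(u0, u4, 0x20);
+    r[1] = _mm256_permute2f128_ps(u1, u5, 0x20);
+    r[2] = _mm256_permute2f128_ps(u2, u6, 0x20);
+    r[3] = _mm256_permute2f128_ps(u3, u7, 0x20);
+    r[4] = _mm256_permute2f128_ps(u0, u4, 0x31);
+    r[5] = _mm256_permute2f128_ps(u1, u5, 0x31);
+    r[6] = _mm256_permute2f128_ps(u2, u6, 0x31);
+    r[7] = _mm256_permute2f128_ps(u3, u7, 0x31);
+}
+```
+
+For the [32][128] slice of §5e this kernel is applied (32/8) × (128/8) = 64
+times, matching the block-transpose count given in Step 2 of §6.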
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h b/nanovdb/nanovdb/util/BatchAccessor.h
similarity index 99%
rename from nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
rename to nanovdb/nanovdb/util/BatchAccessor.h
index 9de8933334..077bb6921b 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.h
+++ b/nanovdb/nanovdb/util/BatchAccessor.h
@@ -38,7 +38,7 @@
 #pragma once
 
 #include
-#include                        // simd_traits, scalar_traits, Simd, SimdMask
+#include <nanovdb/util/Simd.h>
 #include
 #include
 #include
diff --git a/simd_test/Simd.h b/nanovdb/nanovdb/util/Simd.h
similarity index 100%
rename from simd_test/Simd.h
rename to nanovdb/nanovdb/util/Simd.h
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h b/nanovdb/nanovdb/util/StencilAccessor.h
similarity index 99%
rename from nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h
rename to nanovdb/nanovdb/util/StencilAccessor.h
index 2e4bd84a0f..eb562eb8b1 100644
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.h
+++ b/nanovdb/nanovdb/util/StencilAccessor.h
@@ -34,8 +34,8 @@
 #pragma once
 
 #include
-#include
-#include "BatchAccessor.h"
+#include <nanovdb/util/Simd.h>
+#include <nanovdb/util/BatchAccessor.h>
 #include
 #include
diff --git a/simd_test/StencilKernel.h b/simd_test/StencilKernel.h
index f67f7f940a..44750e290c 100644
--- a/simd_test/StencilKernel.h
+++ b/simd_test/StencilKernel.h
@@ -1,5 +1,5 @@
 #pragma once
-#include "Simd.h"
+#include <nanovdb/util/Simd.h>
 
 // Portable __hostdev__ annotation — no-op outside CUDA, matching NanoVDB convention.
 #ifndef __CUDACC__

From 432bec46e6df9d9c66149f69b80565ff92b8e0ae Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sat, 18 Apr 2026 12:13:53 -0500
Subject: [PATCH 28/60] refactor: move BatchAccessor.md, StencilAccessor.md,
 HaloStencilAccessor.md to nanovdb/util/

Keep design docs co-located with their headers now that Simd.h,
BatchAccessor.h, and StencilAccessor.h live in nanovdb/nanovdb/util/.
VBM-specific docs (StencilGather.md, VBMImplementationKnowledge.md,
VoxelBlockManagerContext.md) remain in ex_voxelBlockManager_host_cuda/.
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../ex_voxelBlockManager_host_cuda => util}/BatchAccessor.md | 0 .../HaloStencilAccessor.md | 0 .../ex_voxelBlockManager_host_cuda => util}/StencilAccessor.md | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename nanovdb/nanovdb/{examples/ex_voxelBlockManager_host_cuda => util}/BatchAccessor.md (100%) rename nanovdb/nanovdb/{examples/ex_voxelBlockManager_host_cuda => util}/HaloStencilAccessor.md (100%) rename nanovdb/nanovdb/{examples/ex_voxelBlockManager_host_cuda => util}/StencilAccessor.md (100%) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md similarity index 100% rename from nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md rename to nanovdb/nanovdb/util/BatchAccessor.md diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/HaloStencilAccessor.md b/nanovdb/nanovdb/util/HaloStencilAccessor.md similarity index 100% rename from nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/HaloStencilAccessor.md rename to nanovdb/nanovdb/util/HaloStencilAccessor.md diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md similarity index 100% rename from nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md rename to nanovdb/nanovdb/util/StencilAccessor.md From 8db02182d20bcb5fb2e9a84ef3618ec4240c05a4 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sat, 18 Apr 2026 12:17:37 -0500 Subject: [PATCH 29/60] cleanup: remove simd_test/ scratchpad directory INVESTIGATION.md, StencilKernel.h, and lift_test.cpp were investigation artifacts from the superseded liftToSimd / generic-T approach that preceded BatchAccessor and Simd.h. Relevant findings are preserved in nanovdb/util/BatchAccessor.md. Untracked scratch files (assembly outputs, binaries, codegen experiments) deleted alongside. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- simd_test/INVESTIGATION.md | 293 ------------------------------------- simd_test/StencilKernel.h | 165 --------------------- simd_test/lift_test.cpp | 65 -------- 3 files changed, 523 deletions(-) delete mode 100644 simd_test/INVESTIGATION.md delete mode 100644 simd_test/StencilKernel.h delete mode 100644 simd_test/lift_test.cpp diff --git a/simd_test/INVESTIGATION.md b/simd_test/INVESTIGATION.md deleted file mode 100644 index 9be7f4bd22..0000000000 --- a/simd_test/INVESTIGATION.md +++ /dev/null @@ -1,293 +0,0 @@ -# liftToSimd / Generic-T SIMD Vectorization Investigation - -This document captures the design ideas, experiments, findings, and open questions -from an in-progress investigation into auto-vectorizing a scalar stencil kernel -for the VoxelBlockManager CPU port. Written as a reference for resuming the -investigation in a future session. - ---- - -## 1. Motivation - -The VoxelBlockManager CPU port (branch `vbm-cpu-port`) processes voxels in batches -of `SIMDw = 16` (one AVX2 register width of uint16_t). For each batch the same -stencil computation is applied to every lane. The goal is to write the stencil -physics **once** as a scalar, `__hostdev__`-compatible function (usable unmodified on -the GPU), and automatically derive an auto-vectorized CPU batch kernel from it. - ---- - -## 2. 
Approach A: `liftToSimd` Pattern (superseded) - -### Core Idea - -A scalar kernel with signature `ScalarTupleOut kernel(ScalarTupleIn)` is lifted to -W lanes by replacing every `T` in the tuple types with `std::array` (SoA -layout). A W-iteration loop extracts the i-th element from each input array, calls -the scalar kernel, and stores results back. This loop is the auto-vectorization -target. - -```cpp -template -auto liftToSimd(ScalarFn f) { - return [f](const auto& simdIn, auto& simdOut) { - for (int i = 0; i < W; i++) { - auto scalarIn = extractSlice(simdIn, i, ...); - auto scalarOut = f(scalarIn); - storeSlice(simdOut, i, scalarOut, ...); - } - }; -} -``` - -### Outcome - -Clang 18 vectorizes the unmodified kernel (with `std::max` and `bool isOutside`) -producing a full ymm path with a runtime alias check. GCC 13 does not vectorize in -any attempted form (see §5). - -### Why Superseded - -1. The scalar kernel takes a tuple, not individual arguments, and cannot be - templated on `T` directly — it is a separate code path from the GPU kernel. -2. Vectorization relies entirely on the auto-vectorizer seeing through the tuple - extraction loop, which GCC cannot do. - ---- - -## 3. Approach B: Generic-T Pattern (current) - -### Core Idea - -Write the kernel **once** as a template on its value type `T`: - -- `T = float` → scalar path, `__hostdev__`-compatible, used on GPU per-thread -- `T = Simd` → W-wide SIMD path, used on CPU per-batch - -All arithmetic operators, `min`, `max`, and `where` are overloaded for both `float` -and `Simd`, so the same source compiles correctly for both contexts with -zero `#ifdef`. - -### `where()` — the key primitive - -`bool isOutside ? a : b` cannot be used with a SIMD mask. `where(mask, a, b)` -replaces it: - -```cpp -// Scalar (T=float): plain ternary — GPU path -template T where(bool mask, T a, T b) { return mask ? a : b; } - -// SIMD (T=Simd): lane-wise blend → VBLENDVPS, no branch -template -Simd where(SimdMask mask, Simd a, Simd b); -``` - -`v0 > T(isoValue)` deduces to `bool` when `T=float` and `SimdMask` when -`T=Simd`, so the `where()` call resolves correctly in both cases. - -### Class hierarchy - -`WENO5` and `GodunovsNormSqrd` are free functions in `StencilKernel.h`, -mirroring their counterparts in `Stencils.h`. The stencil data and compute methods -live in a two-level class hierarchy: - -``` -BaseStencilKernel mValues[SIZE], mDx2, mInvDx2 — pure data - | -WenoStencilKernel normSqGrad(), ... — pure compute -``` - -No grid coupling, no accessor, no `moveTo()`. The VBM gather populates `mValues` -directly; `normSqGrad()` is then called on the populated kernel object. - -### GPU / CPU call sites - -```cpp -// GPU: one thread, scalar — fill from per-thread stencil gather -WenoStencilKernel sk(dx); -for (int n = 0; n < 19; n++) sk[n] = gathered_scalar_values[n]; -float result = sk.normSqGrad(isoValue); - -// CPU: one batch, SIMD — fill from VBM batch gather -WenoStencilKernel> sk(dx); -for (int n = 0; n < 19; n++) sk[n] = gathered_simd_values[n]; -Simd result = sk.normSqGrad(isoValue); -``` - -### Relationship to legacy WenoStencil - -The existing `BaseStencil` / `WenoStencil` hierarchy in -`Stencils.h` couples data storage to a grid accessor and a `moveTo()` cursor — a -sequential, single-threaded API incompatible with VBM batch processing. The kernel -hierarchy is designed as its eventual replacement. During transition, the legacy -classes can simply derive from the kernel classes to inherit the compute methods -without disruption. 
- -NVCC's demand-driven template instantiation ensures `WenoStencilKernel>` -is never compiled for device. - ---- - -## 4. `nanovdb::util::Simd` — two backends - -`simd_test/Simd.h` (destined for `nanovdb/util/`) provides `Simd`, -`SimdMask`, `min`, `max`, and `where` with two interchangeable implementations -selected automatically at compile time. Suppress Backend A with -`-DNANOVDB_NO_STD_SIMD` to force the fallback. - -### Backend A: `std::experimental::simd` (C++26 / Parallelism TS v2) - -Activated when `` is available, -`__cpp_lib_experimental_parallel_simd` is defined, and `NANOVDB_NO_STD_SIMD` is not -set. - -`Simd` and `SimdMask` are **pure type aliases** for -`std::experimental::fixed_size_simd` and -`std::experimental::fixed_size_simd_mask`. All arithmetic delegates to the -standard types; the compiler emits native vector instructions without relying on the -auto-vectorizer. - -The TS v2 `where(mask, v)` is a 2-arg masked-assignment proxy, not a 3-arg select. -A thin free function adapts it: -```cpp -template -Simd where(SimdMask mask, Simd a, Simd b) { - auto result = b; - stdx::where(mask, result) = a; - return result; -} -``` - -### Backend B: `std::array` (default, C++17) - -`Simd` wraps `std::array` with element-wise operator loops. -`__hostdev__`-annotated throughout for CUDA compatibility. - -**GCC vectorization note**: GCC's failure to auto-vectorize in §5 was specific to -Approach A's outer-lane loop pattern, where GCC could not see through `std::tuple` -struct indirection in GIMPLE. Backend B's element-wise operator loops (e.g. -`for (int i = 0; i < W; i++) r[i] = a[i] + b[i]`) are a completely different target -— fixed-count, no struct indirection — and GCC does auto-vectorize them when used -with the Generic-T kernel class hierarchy (see §6). - -### element_aligned_tag — portable load/store descriptor - -`nanovdb::util::element_aligned_tag` and `nanovdb::util::element_aligned` are always -present. In Backend A they alias `stdx::element_aligned_tag` (same type the stdx -constructors expect); in Backend B they are a standalone dummy struct (ignored). -This makes the load constructor `Simd(const T*, element_aligned)` portable across -both backends and forward-compatible with `std::simd`. - -### C++26 migration path - -When `std::simd` lands in ``, migration is a one-line change: replace the -`stdx` detection block with `#if __cpp_lib_simd` and `std::experimental` with `std`. -The kernel source, `element_aligned_tag`, and all call sites are unchanged. - ---- - -## 5. Vectorization Experiments and Findings (Approach A) - -Platform: x86-64, AVX2, Ubuntu. GCC 13. Clang 18. -Base flags: `-O3 -march=native -std=c++17` - -> **Warning — GCC false positive diagnostics**: `-fopt-info-vec-missed` / `-fopt-info-vec` -> can report `optimized: loop vectorized using 32 byte vectors` for code *outside* the -> hot loop. Assembly inspection is the only ground truth — always verify with -> `grep -c 'ymm'` and confirm the instructions fall inside the target function. 
- -| Experiment | Kernel | GCC | Clang | -|---|---|---|---| -| 1 | Simple Laplacian (pure arithmetic) | Yes | Yes | -| 2 | WENO5 sum, no conditionals | Yes | Yes | -| 3 | Full `normSqGrad`, `bool isOutside` | **No** (control flow) | **Yes** | -| 4 | Same, `isOutside` = constant `true` | No (control flow in `std::max`) | Yes | -| 5 | `fmaxf` + `float sign` | No (struct-access blocker) | Yes | -| 6 | `fmaxf` + `-ffinite-math-only` | No (false positive diagnostic) | Yes | -| 7 | `__attribute__((optimize("finite-math-only")))` | No (doesn't propagate) | Yes | -| 8 | `__builtin_fmaxf` + `float sign` | No (struct-access blocker) | Yes | -| 9 | Pointer-cache + `__builtin_fmaxf` | No (call-clobbers-memory) | Yes | -| 10 | Flat `float[N][W]` arrays | No (gather stride) | n/a | - -**Conclusion for Approach A**: GCC 13 cannot auto-vectorize the `liftToSimd` pattern -in any attempted form. The root cause is GCC's inability to see through `std::tuple`'s -recursive-inheritance struct layout in GIMPLE — not a limitation of Backend B per se. - ---- - -## 6. Vectorization Results (Approach B, assembly-verified) - -GCC 13, AVX2, `-O3 -march=native -std=c++17`. ymm counts per function (assembly-inspected). - -### Backend A (`std::experimental::simd`, auto-detected) - -| Function | ymm instructions | -|---|---| -| `WenoStencilKernel::normSqGrad` | 945 (WENO5 inlined ×6) | -| `GodunovsNormSqrd` | 289 (out-of-line) | -| `min` / `max` | 10 each | -| `runSimdNormSqGrad` (test wrapper) | 0 (call shell only) | -| **Total** | **1267** | - -### Backend B (`std::array`, forced with `-DNANOVDB_NO_STD_SIMD`) - -| Function | ymm instructions | -|---|---| -| `WenoStencilKernel::normSqGrad` | 365 | -| `WENO5` | 137 (out-of-line) | -| `GodunovsNormSqrd` | 117 (out-of-line) | -| **Total** | **619** | - -Both backends pass all 16 lanes. Backend B vectorizes via GCC's auto-vectorizer on -the fixed-count element-wise operator loops — the struct-access limitation from -Approach A does not apply here. - -Key instructions in both paths: `vfmadd*ps`, `vsubps`, `vmulps`, `vmaxps`, -`vminps`, `vblendvps`, `vcmpnltps`. - ---- - -## 7. Open Questions / Next Steps - -- **Benchmarking**: Throughput of the vectorized path vs. scalar not yet measured on - representative VBM data. -- **Integration**: Move `Simd.h` to `nanovdb/util/Simd.h`; move `StencilKernel.h` - to `nanovdb/math/`; have legacy `WenoStencil` derive from `WenoStencilKernel` - during transition, then retire it. -- **`` header**: Clang 18 provides `` but not ``. - Once `` is available, the detection guard simplifies to `#if __cpp_lib_simd`. -- **Clang assembly verification**: Clang not yet installed on this machine. Previous - results (691 ymm flat in hot function, free-function version) predate the - class-based refactor; re-verification pending. - ---- - -## 8. 
File Reference - -| File | Purpose | -|------|---------| -| `simd_test/Simd.h` | `nanovdb::util::Simd` — two backends, auto-detected (prototype for `nanovdb/util/`) | -| `simd_test/StencilKernel.h` | `BaseStencilKernel`, `WenoStencilKernel`, `WENO5`, `GodunovsNormSqrd` (prototype for `nanovdb/math/`) | -| `simd_test/lift_test.cpp` | Correctness test: SIMD vs scalar reference via `WenoStencilKernel` | -| `nanovdb/nanovdb/math/Stencils.h` | Original scalar `WENO5`, `GodunovsNormSqrd`, `WenoStencil::normSqGrad` | -| `nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md` | Per-block stencil gather design doc | -| `nanovdb/nanovdb/tools/VoxelBlockManager.h` | CPU VBM implementation | - -Build commands: -```sh -# GCC, Backend A (std::experimental::simd, auto-detected): -g++ -O3 -march=native -std=c++17 -o lift_test lift_test.cpp - -# GCC, Backend B (std::array, forced): -g++ -O3 -march=native -std=c++17 -DNANOVDB_NO_STD_SIMD -o lift_test lift_test.cpp - -# Clang, Backend A (std::experimental::simd, C++26): -clang++-18 -O3 -march=native -std=c++26 \ - -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \ - -o lift_test lift_test.cpp - -# Clang, Backend B (std::array, C++17 or forced): -clang++-18 -O3 -march=native -std=c++17 \ - -I/usr/include/c++/13 -I/usr/include/x86_64-linux-gnu/c++/13 \ - -o lift_test lift_test.cpp -``` diff --git a/simd_test/StencilKernel.h b/simd_test/StencilKernel.h deleted file mode 100644 index 44750e290c..0000000000 --- a/simd_test/StencilKernel.h +++ /dev/null @@ -1,165 +0,0 @@ -#pragma once -#include - -// Portable __hostdev__ annotation — no-op outside CUDA, matching NanoVDB convention. -#ifndef __CUDACC__ -# ifndef __hostdev__ -# define __hostdev__ -# endif -#endif - -// --------------------------------------------------------------------------- -// Prototype of the kernel-only stencil hierarchy for NanoVDB. -// -// Defines BaseStencilKernel and WenoStencilKernel, where T is: -// float — scalar, __hostdev__-compatible, GPU per-thread path -// Simd — W-wide SIMD, CPU per-batch path -// -// These are pure data + compute classes with no grid coupling. They are -// intended to replace the compute portions of BaseStencil / WenoStencil in -// nanovdb/math/Stencils.h, with the legacy accessor-based classes deriving -// from these to retain backward compatibility during transition. -// -// Free functions WENO5 and GodunovsNormSqrd mirror their counterparts in -// Stencils.h, templatized on T so they work for both scalar and SIMD. -// --------------------------------------------------------------------------- - -namespace nanovdb { -namespace math { - -using namespace nanovdb::util; // min, max, where, Simd, SimdMask - -// --------------------------------------------------------------------------- -// WENO5 — fifth-order upwind interpolation, templated on T. -// Mirrors WENO5 in Stencils.h; here RealT == T throughout. 
-// --------------------------------------------------------------------------- -template -__hostdev__ inline T WENO5(T v1, T v2, T v3, T v4, T v5, float scale2 = 1.f) -{ - const float C = 13.f / 12.f; - const T eps = T(1.0e-6f * scale2); - - const T d12 = v1 - 2.f*v2 + v3; - const T d13 = v1 - 4.f*v2 + 3.f*v3; - const T d23 = v2 - 2.f*v3 + v4; - const T d24 = v2 - v4; - const T d34 = v3 - 2.f*v4 + v5; - const T d35 = 3.f*v3 - 4.f*v4 + v5; - - const T w1 = C*d12*d12 + 0.25f*d13*d13 + eps; - const T w2 = C*d23*d23 + 0.25f*d24*d24 + eps; - const T w3 = C*d34*d34 + 0.25f*d35*d35 + eps; - - const T A1 = 0.1f / (w1*w1); - const T A2 = 0.6f / (w2*w2); - const T A3 = 0.3f / (w3*w3); - - return (A1*(2.f*v1 - 7.f*v2 + 11.f*v3) + - A2*(5.f*v3 - v2 + 2.f*v4) + - A3*(2.f*v3 + 5.f*v4 - v5)) / (6.f*(A1+A2+A3)); -} - -// --------------------------------------------------------------------------- -// GodunovsNormSqrd — templated on T (value type) and MaskT (mask type). -// Mirrors GodunovsNormSqrd in Stencils.h. -// The if/else branch in the original is replaced by unconditionally computing -// both the outside and inside terms and blending via where(), so the SIMD -// path produces a lane-wise select with no control flow divergence. -// --------------------------------------------------------------------------- -template -__hostdev__ inline T GodunovsNormSqrd(MaskT isOutside, - T dP_xm, T dP_xp, - T dP_ym, T dP_yp, - T dP_zm, T dP_zp) -{ - const T zero(0.f); - T outside = max(max(dP_xm, zero) * max(dP_xm, zero), - min(dP_xp, zero) * min(dP_xp, zero)) // (dP/dx)^2 - + max(max(dP_ym, zero) * max(dP_ym, zero), - min(dP_yp, zero) * min(dP_yp, zero)) // (dP/dy)^2 - + max(max(dP_zm, zero) * max(dP_zm, zero), - min(dP_zp, zero) * min(dP_zp, zero)); // (dP/dz)^2 - - T inside = max(min(dP_xm, zero) * min(dP_xm, zero), - max(dP_xp, zero) * max(dP_xp, zero)) // (dP/dx)^2 - + max(min(dP_ym, zero) * min(dP_ym, zero), - max(dP_yp, zero) * max(dP_yp, zero)) // (dP/dy)^2 - + max(min(dP_zm, zero) * min(dP_zm, zero), - max(dP_zp, zero) * max(dP_zp, zero)); // (dP/dz)^2 - - return where(isOutside, outside, inside); -} - -// --------------------------------------------------------------------------- -// BaseStencilKernel -// -// Owns mValues[SIZE] and the grid spacing parameters mDx2 / mInvDx2. -// No grid accessor, no moveTo — pure data container for stencil compute. -// --------------------------------------------------------------------------- -template -class BaseStencilKernel -{ -protected: - T mValues[SIZE]{}; - float mDx2{1.f}, mInvDx2{1.f}; - -public: - __hostdev__ BaseStencilKernel() = default; - __hostdev__ explicit BaseStencilKernel(float dx) - : mDx2(dx * dx), mInvDx2(1.f / (dx * dx)) {} - - __hostdev__ T& operator[](int n) { return mValues[n]; } - __hostdev__ const T& operator[](int n) const { return mValues[n]; } - - __hostdev__ static constexpr int size() { return SIZE; } -}; - -// --------------------------------------------------------------------------- -// WenoStencilKernel -// -// Derives from BaseStencilKernel and provides normSqGrad() and -// related compute methods. Mirrors the compute interface of WenoStencil in -// nanovdb/math/Stencils.h. 
-// -// mValues layout (matching WenoPt::idx): -// [0] = center ( 0, 0, 0) -// [1]..[6] = x-axis (-3,-2,-1, +1,+2,+3) -// [7]..[12] = y-axis (-3,-2,-1, +1,+2,+3) -// [13]..[18] = z-axis (-3,-2,-1, +1,+2,+3) -// --------------------------------------------------------------------------- -template -class WenoStencilKernel : public BaseStencilKernel -{ - using Base = BaseStencilKernel; - -protected: - using Base::mValues; - using Base::mDx2; - using Base::mInvDx2; - -public: - using Base::Base; - - /// @brief Return the norm-squared of the WENO upwind gradient at the - /// buffered stencil location, using Godunov's scheme. - /// Matches WenoStencil::normSqGrad() in Stencils.h. - __hostdev__ inline T normSqGrad(float isoValue = 0.f) const - { - const T* v = mValues; - const T - dP_xm = WENO5(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2), - dP_xp = WENO5(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2), - dP_ym = WENO5(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2), - dP_yp = WENO5(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2), - dP_zm = WENO5(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2), - dP_zp = WENO5(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2); - - return T(mInvDx2) * GodunovsNormSqrd(v[0] > T(isoValue), - dP_xm, dP_xp, - dP_ym, dP_yp, - dP_zm, dP_zp); - } -}; - -} // namespace math -} // namespace nanovdb diff --git a/simd_test/lift_test.cpp b/simd_test/lift_test.cpp deleted file mode 100644 index 297e733063..0000000000 --- a/simd_test/lift_test.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "StencilKernel.h" -#include -#include - -using namespace nanovdb::util; -using namespace nanovdb::math; - -// --------------------------------------------------------------------------- -// SIMD wrapper — noinline to prevent constant-folding in the test -// --------------------------------------------------------------------------- -constexpr int W = 16; -using FloatSimd = Simd; - -__attribute__((noinline)) -FloatSimd runSimdNormSqGrad(const FloatSimd sv[19], float dx, float isoValue) -{ - WenoStencilKernel sk(dx); - for (int n = 0; n < 19; n++) sk[n] = sv[n]; - return sk.normSqGrad(isoValue); -} - -// --------------------------------------------------------------------------- -// Reference: scalar path — same kernel class instantiated with T=float -// --------------------------------------------------------------------------- -float refNormSqGrad(const float v[19], float dx, float isoValue = 0.f) -{ - WenoStencilKernel sk(dx); - for (int n = 0; n < 19; n++) sk[n] = v[n]; - return sk.normSqGrad(isoValue); -} - -// --------------------------------------------------------------------------- -int main() -{ - const float dx = 0.1f; - - // Storage: SoA layout — inData[n] holds W lane values for stencil position n - float inData[19][W]{}; - float refValues[W][19]; - - for (int i = 0; i < W; i++) - for (int n = 0; n < 19; n++) { - refValues[i][n] = std::sin(n * 0.3f + i * 0.5f); - inData[n][i] = refValues[i][n]; - } - - // Load into Simd — each FloatSimd holds one stencil position across all W lanes - FloatSimd sv[19]; - for (int n = 0; n < 19; n++) - sv[n] = FloatSimd(inData[n], element_aligned); - - FloatSimd result = runSimdNormSqGrad(sv, dx, 0.f); - - printf("WenoNormSqGrad full 3-axis (W=%d, dx=%.2f):\n", W, dx); - bool allOk = true; - for (int i = 0; i < W; i++) { - float ref = refNormSqGrad(refValues[i], dx, 0.f); - float got = result[i]; - bool ok = 
std::abs(got - ref) < 1e-5f * std::abs(ref) + 1e-10f; - printf(" lane %2d: %12.6f ref: %12.6f %s\n", i, got, ref, ok ? "OK" : "FAIL"); - allOk &= ok; - } - printf("\nOverall: %s\n", allOk ? "PASS" : "FAIL"); - return allOk ? 0 : 1; -} From 2610a1187200bebf8d4d6f7e99bd4518f47f8b85 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sat, 18 Apr 2026 12:21:11 -0500 Subject: [PATCH 30/60] cleanup: remove prefix-count sanity check from vbm_host_cuda_kernels.cu The inline prefix-scan / SWAR-popcount investigation block (lines 374-438) was an algorithm-exploration artifact that reimplemented decodeInverseMaps internals rather than testing the public API. Removes the block along with the now-unused popcount32 helper and the immintrin.h include. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Efty Sifakis --- .../vbm_host_cuda_kernels.cu | 78 ------------------- 1 file changed, 78 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu index 9070bc38d9..91af736e9c 100644 --- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu +++ b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/vbm_host_cuda_kernels.cu @@ -23,7 +23,6 @@ #include -#include #include #include @@ -38,16 +37,6 @@ static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128 using VBM = nanovdb::tools::cuda::VoxelBlockManager; using CPUVBM = nanovdb::tools::VoxelBlockManager; -/// @brief Software 32-bit popcount (Hamming weight) via the AND/shift/add/multiply path. -/// Unlike hardware POPCNT (which is scalar), this compiles to VPMULLD under AVX2 and -/// vectorizes across all 16 lanes of a uint32_t vertical sweep over a valueMask. -inline uint32_t popcount32(uint32_t x) -{ - x = x - ((x >> 1) & 0x55555555u); - x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); - x = (x + (x >> 4)) & 0x0f0f0f0fu; - return (x * 0x01010101u) >> 24; -} /// @brief For each VBM block, decode the inverse map and store /// (leafIndex, voxelOffset) for every active voxel into global output arrays @@ -370,73 +359,6 @@ void runVBMCudaTest(const std::vector& coords) static constexpr int nPerfRuns = 20; - // --- Sanity check: countOn + std::fill only, no tree access --- - { - std::cout << "\nCPU decodeInverseMaps sanity (countOn only):\n"; - - // Never true: used only to prevent dead code elimination. - volatile uint32_t dummy = 0; - - for (int run = 0; run < nPerfRuns; ++run) { - nanovdb::util::Timer timer; - timer.start(""); - - nanovdb::util::forEach(0, nBlocks, 1, - [&](const nanovdb::util::Range1D& range) { - for (auto bID = range.begin(); bID < range.end(); ++bID) { - int nExtraLeaves = 0; - for (int i = 0; i < CPUVBM::JumpMapLength; i++) - nExtraLeaves += nanovdb::util::countOn( - jumpMapPtr[(uint64_t)bID * CPUVBM::JumpMapLength + i]); - - // Reinterpret the first leaf's 8 x uint64_t valueMask as - // 16 x uint32_t words, one per group of 32 consecutive voxels. - const auto& leaf = - h_grid->tree().getFirstNode<0>()[firstLeafIDPtr[bID]]; - const uint32_t* maskWords = - reinterpret_cast(leaf.valueMask().words()); - - // Phase 1: per-word inclusive prefix counts. - // prefixCountRealigned[step][lane] = popcount(maskWords[lane] & mask) - // where mask covers bits 0..step (inclusive). - // At step=31, mask=0xFFFFFFFF so row[31] == wordPopcount[lane]. - // Safe mask form: (uint32_t(2) << step) - 1u avoids UB at step=31. 
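A quick illustration of why the safe form matters (added sketch; the assert merely restates the comment above):

    // At step=31 the naive mask (1u << (step+1)) - 1u would shift a 32-bit
    // value by 32, which is undefined behaviour. The safe form stays defined
    // because unsigned arithmetic is modular: uint32_t(2) << 31 wraps to 0,
    // and 0u - 1u == 0xFFFFFFFF, the all-ones mask the comment promises.
    static_assert(((uint32_t(2) << 31) - 1u) == 0xFFFFFFFFu,
                  "safe mask form covers all 32 bits at step=31");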
- alignas(32) uint32_t prefixCountRealigned[32][16]; - for (int step = 0; step < 32; step++) { - const uint32_t mask = (uint32_t(2) << step) - 1u; - #pragma omp simd - for (int lane = 0; lane < 16; lane++) - prefixCountRealigned[step][lane] = - popcount32(maskWords[lane] & mask); - } - - // Phase 2: exclusive prefix scan of row[31] -> baseOffset[lane], - // then add baseOffset to every row to get global prefix counts. - uint32_t baseOffset[16]; - baseOffset[0] = 0; - for (int lane = 1; lane < 16; lane++) - baseOffset[lane] = baseOffset[lane-1] + - prefixCountRealigned[31][lane-1]; - - for (int step = 0; step < 32; step++) { - #pragma omp simd - for (int lane = 0; lane < 16; lane++) - prefixCountRealigned[step][lane] += baseOffset[lane]; - } - - // Dummy: global prefix count at the last voxel equals the total - // active voxel count for this leaf, which is <= 512, never 513. - if (prefixCountRealigned[31][15] == 513u) - dummy = prefixCountRealigned[31][15]; - } - }); - - const float ms = - (float)timer.elapsed() / 1000.0f; - std::cout << " run " << run << ": " << ms << " ms\n"; - } - } - std::atomic dummy{0}; std::cout << "\nCPU decodeInverseMaps performance (" From 6fce798ee782eb76abddc993f5d82ef9a3976f55 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sat, 18 Apr 2026 14:33:41 -0500 Subject: [PATCH 31/60] StencilAccessor: end-to-end perf analysis + Legacy scalar oracle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New LegacyStencilAccessor.h: scalar ReadAccessor-based reference, templated on the same StencilT as StencilAccessor. Serves as a correctness oracle and performance baseline (mirrors the accessor path-eviction behaviour of OpenVDB's math/Stencils.h). - ex_stencil_gather_cpu: rewrite runPerf to multi-thread both paths via util::forEach, with an XOR-checksum anti-DCE, nanovdb::util::Timer wall-clock timing, and a decodeInverseMaps-only baseline pass. The decode step is ~3 ms of the ~127 ms end-to-end time (~2%), so the SIMD-vs-scalar comparison is dominated by the stencil-gather itself. - examples/CMakeLists.txt: bump ex_stencil_gather_cpu flags from -mavx2 to -march=native. - BatchAccessor.md §8h: full end-to-end codegen investigation on i9-285K Arrow Lake, 32 threads, 16.7 M active voxels. Default GCC 13 -O3 outlines 14 Simd.h helpers per cachedGetValue and outlines the 18 per-tap calls from calcTaps, producing ~323 calls and ~282 vzeroupper per 16-voxel batch — StencilAccessor runs at 7.5 ns/vox, losing to scalar LegacyStencilAccessor at 5.4 ns/vox. [[gnu::flatten]] on StencilAccessor::moveTo collapses the entire call tree into a single 77 KB inlined body: GCC drops to 3.7 ns/vox (2x), beating Clang's 4.3 ns/vox. Flattening only BatchAccessor::{prefetch,cachedGetValue} is insufficient (4.9 ns/vox). W=8 cuts YMM spills by 86% but regresses GCC end-to-end because per-batch framing overhead amortises over fewer lanes. Attributes not applied in the shipped headers — opt-in for StencilAccessor consumers that need peak GCC performance. - StencilAccessor.md §8.1: short GCC codegen note pointing to §8h. 
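The oracle pattern, as exercised by the rewritten example (a sketch using the
LegacyAccT alias and grid pointer declared in stencil_gather_cpu.cpp; ijk is
an illustrative stand-in for any active voxel coordinate):

    LegacyAccT oracle(*grid);        // owns its own ReadAccessor
    oracle.moveTo(ijk);              // ijk: any active voxel coordinate
    uint64_t sum = 0;
    for (int k = 0; k < LegacyAccT::size(); ++k)
        sum += oracle[k];            // gathered WENO5 tap indices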
Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/examples/CMakeLists.txt       |    2 +-
 .../stencil_gather_cpu.cpp                    | 1061 ++++-------------
 nanovdb/nanovdb/util/BatchAccessor.md         |  196 ++-
 nanovdb/nanovdb/util/LegacyStencilAccessor.h  |  114 ++
 nanovdb/nanovdb/util/StencilAccessor.md       |   15 +
 5 files changed, 557 insertions(+), 831 deletions(-)
 create mode 100644 nanovdb/nanovdb/util/LegacyStencilAccessor.h

diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt
index 66f9365d78..5c35199e91 100644
--- a/nanovdb/nanovdb/examples/CMakeLists.txt
+++ b/nanovdb/nanovdb/examples/CMakeLists.txt
@@ -125,7 +125,7 @@ endif()
 # No CUDA required. Design in ex_voxelBlockManager_host_cuda/StencilGather.md.
 nanovdb_example(NAME "ex_stencil_gather_cpu")
 if(TARGET ex_stencil_gather_cpu)
-  target_compile_options(ex_stencil_gather_cpu PRIVATE -mavx2 -fopenmp-simd)
+  target_compile_options(ex_stencil_gather_cpu PRIVATE -march=native -fopenmp-simd)
   # simd_test/Simd.h lives three levels above this CMakeLists (at the repo root).
   target_include_directories(ex_stencil_gather_cpu PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index 9d2644200c..582f3e6773 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -4,22 +4,25 @@
 
 /*! \file stencil_gather_cpu.cpp
 
-    \brief Prototype for CPU SIMD stencil gather — Phase 1 only:
-    neighbor leaf resolution with lazy probeLeaf and per-leaf probedMask cache.
-
-    Design documented in:
-    nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
-
-    What this prototype does (and does NOT do):
-    - Generates a random Morton-layout domain (same as vbm_host_cuda).
-    - Builds a ValueOnIndex NanoVDB grid and a VoxelBlockManager.
-    - For every block: calls decodeInverseMaps, then processes SIMD batches of
-      SIMDw=16 lanes, running the full probedMask / probeLeaf / batchPtrs population
-      pipeline described in StencilGather.md §8d–§8f.
-    - Does NOT call computeStencil. Instead, verifies that batchPtrs[4][SIMDw]
-      is correct for every active lane: for each of the 18 non-center WENO5
-      stencil offsets, if the offset crosses a leaf boundary the corresponding
-      batchPtrs[axis+1][lane] is checked against a direct probeLeaf reference.
+    \brief CPU stencil gather: LegacyStencilAccessor vs StencilAccessor.
+
+    Generates a random sparse domain, builds a ValueOnIndex NanoVDB grid and
+    a VoxelBlockManager, then runs two stencil-index gather paths side by side:
+
+    LegacyStencilAccessor -- scalar, one voxel at a time, ReadAccessor-based.
+                             Equivalent to OpenVDB's math/Stencils.h baseline:
+                             path-cached tree walk per tap, per voxel.
+                             The core comparison is the cost of path-cache
+                             eviction: distant WENO5 taps (±3) evict the
+                             center-leaf path, so each moveTo re-traverses
+                             the tree multiple times per voxel.
+
+    StencilAccessor       -- SIMD batch, SIMDw=16 lanes, BatchAccessor-based.
+                             Resolves neighbor leaves once per center-leaf run,
+                             then accesses all taps via direct array indexing.
+
+    runPrototype() cross-validates both paths (LegacyStencilAccessor is the oracle).
+    runPerf() times the full pipeline end-to-end for both paths (wall clock; warmed second pass only).
 
     Build: Configured via CMakeLists.txt in the parent examples/ directory.
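The path-eviction cost called out in the header, in miniature (illustrative
only; acc is any NanoVDB ReadAccessor and ijk a voxel inside some leaf L):

    acc.getValue(ijk);                            // caches the path to leaf L
    acc.getValue(ijk + nanovdb::Coord(-3, 0, 0)); // a ±3 tap may land in a
                                                  // neighbor leaf, evicting
                                                  // the cached path
    acc.getValue(ijk);                            // re-walks the tree back to L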
@@ -34,19 +37,21 @@ #include #include #include -#include // SimdMask, Simd, any_of, none_of, to_bitmask -#include // BatchAccessor -#include // StencilAccessor, Weno5Stencil - -#include // __rdtsc, __rdtscp, _mm_lfence +#include +#include +#include +#include #include #include #include #include #include -#include +#include #include +#include +#include // std::accumulate (checksum) +#include // ============================================================ // Constants and type aliases @@ -54,37 +59,16 @@ static constexpr int Log2BlockWidth = 7; static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128 -static constexpr int SIMDw = 16; // batch width -static constexpr int R = 3; // WENO5 stencil reach (±3) +static constexpr int SIMDw = 16; // StencilAccessor batch width using BuildT = nanovdb::ValueOnIndex; using GridT = nanovdb::NanoGrid; using LeafT = nanovdb::NanoLeaf; using CPUVBM = nanovdb::tools::VoxelBlockManager; -using AccT = nanovdb::DefaultReadAccessor; - -// Lane-predicate types: SIMDw-wide boolean mask and the uint32_t vector it compares. -using LeafIdxVec = nanovdb::util::Simd; -using LaneMask = nanovdb::util::SimdMask; - -// BatchAccessor instantiation for correctness cross-validation. -// ValueT = Simd because ValueOnIndex leaf values are uint64_t active indices. -using BAccT = nanovdb::BatchAccessor, // ValueT - nanovdb::util::Simd, // VoxelOffsetT - LaneMask>; // PredicateT - -// StencilAccessor instantiation for WENO5. -using SAccT = nanovdb::StencilAccessor; -// Return type of StencilAccessor::moveTo (mask over the uint64_t index domain). -using IndexMaskT = nanovdb::util::SimdMask; - -// Direction bit encoding shared across all stencil types: -// bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz ∈ {-1,0,+1} -// -// WENO5 face-neighbor bits (the only 6 bits ever set for WENO5): -static constexpr int kLoBit[3] = {4, 10, 12}; // x-lo, y-lo, z-lo -static constexpr int kHiBit[3] = {22, 16, 14}; // x-hi, y-hi, z-hi + +using SAccT = nanovdb::StencilAccessor; +using LegacyAccT = nanovdb::LegacyStencilAccessor; +using IndexMaskT = nanovdb::util::SimdMask; // ============================================================ // Test domain generation (mirrors vbm_host_cuda.cpp) @@ -125,535 +109,32 @@ generateDomain(int ambient_voxels, float occupancy, uint32_t seed = 42) return coords; } -// ============================================================ -// Neighbor direction utilities (§6a) -// ============================================================ - -/// @brief Return the origin of the neighbor leaf at direction bit d from center. -/// bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1); leaf stride = 8 per axis. -static inline nanovdb::Coord neighborLeafOrigin(const nanovdb::Coord& center, int bit) -{ - const int dx = bit / 9 - 1; - const int dy = (bit / 3) % 3 - 1; - const int dz = bit % 3 - 1; - return center + nanovdb::Coord(dx * 8, dy * 8, dz * 8); -} - -/// @brief Extract the local axis coordinate from a voxelOffset. -/// NanoVDB leaf layout: offset = lx*64 + ly*8 + lz. -/// axis 0 (x): bits [8:6] → shift 6 -/// axis 1 (y): bits [5:3] → shift 3 -/// axis 2 (z): bits [2:0] → shift 0 -static inline int localAxisCoord(uint16_t vo, int axis) -{ - return (vo >> (6 - 3 * axis)) & 7; -} - -// ============================================================ -// Vectorized computeNeededDirs — shift-OR carry trick (§8e) -// -// Determines which of the 6 face-neighbor directions (±x, ±y, ±z) are required -// by any active lane in a SIMDw-wide batch. 
For WENO5 (R=3): -// plus-c neighbor needed iff any active lane has local coordinate lc ≥ 8−R = 5 -// minus-c neighbor needed iff any active lane has local coordinate lc ≤ R−1 = 2 -// -// Algorithm — "expand, add, reduce": -// 1. expandVoxelOffset(): pack lz, lx, ly (and second copies) into a 32-bit -// integer with 3-bit zero guards between groups. Each group occupies its -// own 3-bit field; the guards absorb carries so adjacent groups do not bleed. -// 2. Add kExpandCarryK to all SIMDw lane values simultaneously (one SIMD add). -// Groups 1–3 detect plus-directions; groups 4–6 detect minus-directions. -// 3. Horizontal OR across all lanes: carry bit SET → plus-direction needed. -// Horizontal AND across all lanes: carry bit CLEAR → minus-direction needed. -// (A minus-direction is needed when at least one lane has NO carry, i.e., -// lc < R; the AND bit is clear iff any lane failed to carry.) -// -// ============================================================ - -/// @brief voxelOffset sentinel for inactive / don't-care SIMD lanes. -/// -/// Any lane with laneMask[i] = false needs a voxelOffset value that: -/// • does NOT set carry bits 3, 9, 15 (would wrongly assert plus-directions), AND -/// • DOES set carry bits 19, 25, 31 (a clear bit would wrongly assert minus-directions). -/// -/// Local coordinate (4, 4, 4) satisfies both: R ≤ 4 < 8−R for R=3 (strictly interior). -/// voxelOffset(4,4,4) = 4*64 + 4*8 + 4 = 292 -/// group 1-3: 4 + 3 = 7 → no carry → bits 3, 9, 15 stay clear ✓ -/// group 4-6: 4 + 5 = 9 → carry → bits 19, 25, 31 stay set ✓ -/// -static constexpr uint16_t kInactiveVoxelOffset = (4u << 6) | (4u << 3) | 4u; // = 292 - -/// @brief Expand a 9-bit voxelOffset into a 32-bit "carry lane" layout. -/// -/// NanoVDB voxelOffset bit layout: [8:6] = lx, [5:3] = ly, [2:0] = lz. -/// -/// Target 32-bit layout — 6 groups of 3 bits with zero-guard separators: -/// -/// bits 0– 2 : lz ← group 1 (plus-z carry exits at bit 3) -/// bits 3– 5 : 0 (3-bit guard) -/// bits 6– 8 : lx ← group 2 (plus-x carry exits at bit 9) -/// bits 9–11 : 0 (3-bit guard) -/// bits 12–14 : ly ← group 3 (plus-y carry exits at bit 15) -/// bit 15 : 0 (1-bit guard — sufficient because max carry from a -/// 3-bit field added to a constant < 8 is exactly 1 bit; -/// at bit 15: input=0, addend=0, carry-in∈{0,1} → no further carry) -/// bits 16–18 : lz ← group 4 (minus-z carry exits at bit 19) -/// bits 19–21 : 0 (3-bit guard) -/// bits 22–24 : lx ← group 5 (minus-x carry exits at bit 25) -/// bits 25–27 : 0 (3-bit guard) -/// bits 28–30 : ly ← group 6 (minus-y carry exits at bit 31) -/// bit 31 : 0 (receives minus-y carry; bit 31 is within uint32_t range) -/// -/// Construction — three shift-OR steps, no multiply: -/// -/// Step 1: e |= (e << 9) → 0o xyzxyz (two 9-bit copies stacked, 18 bits) -/// Step 2: e &= 0x71C7 → keep lz@[0:2], lx@[6:8], ly@[12:14]; zero all others. -/// 0x71C7 = 0b 0111 0001 1100 0111 = 0o 070707 -/// Bits set: {0,1,2, 6,7,8, 12,13,14} -/// Step 3: e |= (e << 16) → copy the 15-bit pattern to bits [16:18],[22:24],[28:30] -/// -static inline constexpr uint32_t expandVoxelOffset(uint16_t vo) -{ - uint32_t e = vo; - e |= (e << 9); // step 1: two packed xyz copies at 9-bit stride - e &= 0x71C7u; // step 2: isolate lz@[0:2], lx@[6:8], ly@[12:14] with zero gaps - e |= (e << 16); // step 3: second copy to bits [16:18], [22:24], [28:30] - return e; -} - -/// @brief Combined carry-detection constant (added to expandVoxelOffset results). 
-/// -/// Groups 1–3 receive +R so a 3-bit field ≥ (8−R) produces a carry (plus-direction test). -/// Groups 4–6 receive +(8−R) so a 3-bit field ≥ R produces a carry (minus-direction test: -/// carry CLEAR ⟺ field < R ⟺ minus-direction needed). -/// -/// group1 group2 group3 group4 group5 group6 -/// K = R | R<<6 | R<<12 | (8-R)<<16 | (8-R)<<22 | (8-R)<<28 -/// = 3 | 192 | 12288 | 327680 | 20971520 | 1342177280 -/// = 1,363,488,963 (0x514530C3) — fits in uint32_t (< 2^32). -/// -/// Carry bits produced by expanded + K: -/// bit 3 set ↔ lz ≥ 8−R → plus-z needed -/// bit 9 set ↔ lx ≥ 8−R → plus-x needed -/// bit 15 set ↔ ly ≥ 8−R → plus-y needed -/// bit 19 clear ↔ lz < R → minus-z needed -/// bit 25 clear ↔ lx < R → minus-x needed -/// bit 31 clear ↔ ly < R → minus-y needed -/// -static constexpr uint32_t kExpandCarryK = - ((uint32_t)R ) | // bits 0– 2: +R → lz plus-z group - ((uint32_t)R << 6) | // bits 6– 8: +R → lx plus-x group - ((uint32_t)R << 12) | // bits 12–14: +R → ly plus-y group - ((uint32_t)(8-R) << 16) | // bits 16–18: +5 → lz minus-z group - ((uint32_t)(8-R) << 22) | // bits 22–24: +5 → lx minus-x group - ((uint32_t)(8-R) << 28); // bits 28–30: +5 → ly minus-y group - -/// @brief Pre-expanded sentinel value for inactive / straddle SIMD lanes. -/// -/// Caller broadcasts this to all lanes before overwriting the leafMask lanes -/// with the real expandVoxelOffset() values. Equivalent to -/// expandVoxelOffset(kInactiveVoxelOffset) -/// which, at compile time, is 0x41044104. -static constexpr uint32_t kSentinelExpanded = expandVoxelOffset(kInactiveVoxelOffset); - -/// @brief Scalar reference implementation (lane-by-lane loop). -/// Kept alongside the SIMD version so debug builds can cross-check. -static uint32_t computeNeededDirsScalar(const uint16_t* voxelOffset, - int batchStart, - LaneMask laneMask) -{ - uint32_t needed = 0; - for (int i = 0; i < SIMDw; i++) { - if (!laneMask[i]) continue; - const uint16_t vo = voxelOffset[batchStart + i]; - for (int axis = 0; axis < 3; axis++) { - const int lc = localAxisCoord(vo, axis); - if (lc < R) needed |= (1u << kLoBit[axis]); - if (lc >= 8-R) needed |= (1u << kHiBit[axis]); - } - } - return needed; -} - -/// @brief Vectorized computeNeededDirs — shift-OR carry trick. -/// -/// Returns the 27-bit probedMask subset identifying which of the 6 WENO5 -/// face-neighbor directions are required by any active lane. -/// -/// For WENO5 (R=3) only the 6 face-direction bits {4,10,12,14,16,22} can -/// ever be set; the 21 edge/corner bits remain zero. -/// -/// @param expandedVec SIMDw pre-expanded voxelOffset values (see expandVoxelOffset). -/// Caller is responsible for: -/// • Broadcasting kSentinelExpanded to all lanes first. -/// • Overwriting leafMask lanes with expandVoxelOffset(voxelOffset[...]). -/// This keeps sentinel / masking logic at the single gather site where leafMask -/// is known, not buried inside this function. -/// -/// High-level flow: -/// 1. Single SIMD add of kExpandCarryK (caller already expanded each lane). -/// 2. Horizontal OR of all results → carry SET = plus-direction needed. -/// Horizontal AND of all results → carry CLEAR = minus-direction needed. -/// 3. Map carry bits to the 27-bit probedMask encoding. -/// -static uint32_t computeNeededDirs(nanovdb::util::Simd expandedVec) -{ - using VecU32 = nanovdb::util::Simd; - - // --- Single SIMD add -------------------------------------------------- - // Inject carry-detection thresholds for all 6 groups simultaneously. 
- // After this add, each lane's result[i] encodes all six direction tests - // as carry bits at positions 3, 9, 15 (plus) and 19, 25, 31 (minus). - const VecU32 result = expandedVec + VecU32(kExpandCarryK); - - // --- Horizontal reductions -------------------------------------------- - // - // hor_or: bit k is set iff at least one lane has bit k set in result. - // → Check carry bits 3 (z), 9 (x), 15 (y): SET means plus-direction needed. - // - // hor_and: bit k is set iff every lane has bit k set in result. - // → Check carry bits 19 (z), 25 (x), 31 (y): CLEAR means minus-direction - // needed (at least one lane did not carry, i.e., its coordinate < R). - // - uint32_t hor_or = 0u, hor_and = ~0u; - for (int i = 0; i < SIMDw; i++) { - hor_or |= result[i]; - hor_and &= result[i]; - } - - // --- Map carry bits → probedMask direction bits ----------------------- - // - // Plus carries (bits 3, 9, 15) set → kHiBit (hi-side neighbor needed). - // Minus carries (bits 19, 25, 31) clear → kLoBit (lo-side neighbor needed). - // - // carry bit | axis | condition | probedMask bit - // ----------+------+------------+--------------- - // 3 | z | lz ≥ 8−R | kHiBit[2] = 14 - // 9 | x | lx ≥ 8−R | kHiBit[0] = 22 - // 15 | y | ly ≥ 8−R | kHiBit[1] = 16 - // 19 clr | z | lz < R | kLoBit[2] = 12 - // 25 clr | x | lx < R | kLoBit[0] = 4 - // 31 clr | y | ly < R | kLoBit[1] = 10 - // - uint32_t needed = 0; - if ( hor_or & (1u << 3)) needed |= (1u << kHiBit[2]); // plus-z - if ( hor_or & (1u << 9)) needed |= (1u << kHiBit[0]); // plus-x - if ( hor_or & (1u << 15)) needed |= (1u << kHiBit[1]); // plus-y - if (!(hor_and & (1u << 19))) needed |= (1u << kLoBit[2]); // minus-z - if (!(hor_and & (1u << 25))) needed |= (1u << kLoBit[0]); // minus-x - if (!(hor_and & (1u << 31))) needed |= (1u << kLoBit[1]); // minus-y - - return needed; -} - -// ============================================================ -// Targeted sentinel correctness test (§8e supplement) -// -// Verifies that inactive lanes — including straddle lanes that ARE active -// voxels but belong to a different leaf — do not inject spurious direction -// bits into the SIMD result. -// -// The test is designed so that a broken sentinel (i.e., using the straddle -// lane's real voxelOffset instead of kInactiveVoxelOffset) would produce a -// DIFFERENT result from the scalar reference in BOTH the plus and minus -// directions, making the bug impossible to miss. -// -// Layout (SIMDw = 16 lanes): -// leafMask lanes (even: 0,2,4,...,14): -// lx=4 (neutral for x), ly=4 (neutral for y), lz=6 (→ plus-z needed) -// voxelOffset = 4*64 + 4*8 + 6 = 294 -// -// straddle lanes (odd: 1,3,5,...,15) — active voxels, wrong leaf: -// lx=0 (→ minus-x if used), ly=7 (→ plus-y if used), lz=1 (→ minus-z if used) -// voxelOffset = 0*64 + 7*8 + 1 = 57 -// -// Expected result (scalar — straddle lanes ignored): -// plus-z needed (bit kHiBit[2]=14): lz=6 ≥ 5 in leafMask lanes ✓ -// minus-x NOT needed: lx=4 ≥ R=3 for all leafMask lanes ✓ -// plus-y NOT needed: ly=4 < 8-R=5 for all leafMask lanes ✓ -// minus-z NOT needed: lz=6 ≥ R=3 for all leafMask lanes ✓ -// plus-x NOT needed: lx=4 < 8-R=5 for all leafMask lanes ✓ -// minus-y NOT needed: ly=4 ≥ R=3 for all leafMask lanes ✓ -// -// If sentinel fails: straddle lx=0 → minus-x spuriously added; -// straddle ly=7 → plus-y spuriously added; -// straddle lz=1 → minus-z spuriously added. -// Those discrepancies are caught by the scalar cross-check inside -// computeNeededDirs, which will abort immediately. 
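One concrete instance of the carry arithmetic, using the leafMask-lane offset
from the layout above (added sketch; the values follow from expandVoxelOffset
and kExpandCarryK as defined earlier):

    // vo = 294 encodes (lx, ly, lz) = (4, 4, 6): interior in x and y,
    // hi-edge in z (lz = 6 >= 8-R = 5).
    static_assert(expandVoxelOffset(294) == 0x41064106u, "expanded layout");
    static_assert(expandVoxelOffset(294) + kExpandCarryK == 0x924B71C9u, "carry sum");
    // Plus side: only bit 3 set -> plus-z needed, no plus-x / plus-y.
    static_assert((0x924B71C9u & ((1u<<3)|(1u<<9)|(1u<<15))) == (1u<<3), "plus carries");
    // Minus side: bits 19, 25, 31 all set -> every axis carried -> no minus dirs.
    static_assert((0x924B71C9u & ((1u<<19)|(1u<<25)|(1u<<31)))
                      == ((1u<<19)|(1u<<25)|(1u<<31)), "minus carries");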
-// ============================================================ - -static void verifyComputeNeededDirsSentinel() -{ - // --- Sentinel property: expandVoxelOffset(292) + K must have --- - // --- plus-carry bits {3,9,15} clear and minus-carry bits {19,25,31} set --- - { - const uint32_t expanded = expandVoxelOffset(kInactiveVoxelOffset); - const uint32_t result = expanded + kExpandCarryK; - const bool plus_ok = !(result & ((1u<<3)|(1u<<9)|(1u<<15))); - const bool minus_ok = (result & ((1u<<19)|(1u<<25)|(1u<<31))) == - ((1u<<19)|(1u<<25)|(1u<<31)); - if (!plus_ok || !minus_ok) { - std::cerr << "verifyComputeNeededDirsSentinel: sentinel carry property violated" - << " expanded=0x" << std::hex << expanded - << " result=0x" << result << std::dec << "\n"; - std::abort(); - } - } - - // --- Straddle scenario: straddle lanes must not pollute the result --- - alignas(64) uint16_t voxelOffset[BlockWidth] = {}; - - // leafMask lanes (even): lx=4, ly=4, lz=6 → voxelOffset = 4*64+4*8+6 = 294 - // straddle lanes (odd): lx=0, ly=7, lz=1 → voxelOffset = 0*64+7*8+1 = 57 - LaneMask laneMask; - for (int i = 0; i < SIMDw; i++) { - const bool active = (i % 2 == 0); - laneMask[i] = active; - voxelOffset[i] = active ? uint16_t(294) : uint16_t(57); - } - - // Expected: only plus-z (kHiBit[2] = 14) should be set. - // - // Build the pre-expanded vector exactly as the gather site would. - using VecU32 = nanovdb::util::Simd; - VecU32 expandedVec(kSentinelExpanded); - for (int i = 0; i < SIMDw; i++) { - if (laneMask[i]) expandedVec[i] = expandVoxelOffset(voxelOffset[i]); - } - const uint32_t result = computeNeededDirs(expandedVec); - - // Explicit cross-check: scalar reference (SIMD cross-check no longer lives inside - // computeNeededDirs — it is the caller's responsibility at each gather site). - { - const uint32_t ref = computeNeededDirsScalar(voxelOffset, 0, laneMask); - if (result != ref) { - std::cerr << "verifyComputeNeededDirsSentinel: SIMD/scalar mismatch" - << " simd=0x" << std::hex << result - << " ref=0x" << ref << std::dec << "\n"; - std::abort(); - } - } - - const uint32_t expected = (1u << kHiBit[2]); // plus-z only - - if (result != expected) { - std::cerr << "verifyComputeNeededDirsSentinel: wrong direction mask" - << " got=0x" << std::hex << result - << " expected=0x" << expected << std::dec << "\n"; - std::abort(); - } - - std::cout << "verifyComputeNeededDirsSentinel: PASSED\n"; -} - // ============================================================ // Verification // ============================================================ struct VerifyStats { - uint64_t laneChecks = 0; // stencil-point/lane combinations inspected - uint64_t errors = 0; -}; - -// 18 non-center WENO5 stencil offsets {axis, delta}. -// Each point moves strictly along one axis (axis-aligned stencil). -static constexpr int kWeno5Offsets[18][2] = { - {0,-3},{0,-2},{0,-1},{0,+1},{0,+2},{0,+3}, // x-axis - {1,-3},{1,-2},{1,-1},{1,+1},{1,+2},{1,+3}, // y-axis - {2,-3},{2,-2},{2,-1},{2,+1},{2,+2},{2,+3}, // z-axis + uint64_t laneChecks = 0; + uint64_t errors = 0; }; -/// @brief For every active lane (set in laneMask), walk the 18 non-center WENO5 -/// stencil offsets. For offsets that cross a leaf boundary, confirm that -/// batchPtrs[axis+1][lane] matches a direct probeLeaf reference. -/// Also confirms that batchPtrs[0][lane] == &firstLeaf[leafIndex[batchStart+lane]]. 
-static void verifyBatchPtrs( - const LeafT* const (&batchPtrs)[4][SIMDw], - const LeafT* firstLeaf, - const uint32_t* leafIndex, - const uint16_t* voxelOffset, - int batchStart, - LaneMask laneMask, - AccT& refAcc, - VerifyStats& stats) -{ - for (int i = 0; i < SIMDw; i++) { - if (!laneMask[i]) continue; - const int p = batchStart + i; - - const LeafT* centerLeaf = &firstLeaf[leafIndex[p]]; - const nanovdb::Coord cOrig = centerLeaf->origin(); - const uint16_t vo = voxelOffset[p]; - - // Center slot must always point to the center leaf. - stats.laneChecks++; - if (batchPtrs[0][i] != centerLeaf) { - ++stats.errors; - if (stats.errors <= 10) - std::cerr << "CENTER MISMATCH lane=" << i << "\n"; - } - - // Walk each stencil offset. - for (const auto& off : kWeno5Offsets) { - const int axis = off[0]; - const int delta = off[1]; - const int lc = localAxisCoord(vo, axis); - - const bool crossesLo = (lc + delta < 0); - const bool crossesHi = (lc + delta >= 8); - if (!crossesLo && !crossesHi) continue; // stays in center leaf - - // Expected: probe the adjacent leaf in the crossing direction. - const int dirBit = crossesLo ? kLoBit[axis] : kHiBit[axis]; - const nanovdb::Coord nOrig = neighborLeafOrigin(cOrig, dirBit); - const LeafT* expected = refAcc.probeLeaf(nOrig); - const LeafT* actual = batchPtrs[1 + axis][i]; - - stats.laneChecks++; - if (actual != expected) { - ++stats.errors; - if (stats.errors <= 10) { - std::cerr << "MISMATCH: lane=" << i - << " axis=" << axis << " delta=" << delta - << " lc=" << lc - << " expected=" << static_cast(expected) - << " actual=" << static_cast(actual) << "\n"; - } - } - } - } -} - -// ============================================================ -// BatchAccessor correctness verification -// -// checkOneTap: calls batchAcc.cachedGetValue for stencil tap (di,dj,dk), -// then for each active lane compares the result against a direct tree reference. -// -// Assumes the caller has already issued the 6 WENO5 extremal prefetches so that -// all directions reachable by ±3 along any axis are in mProbedMask. -// ============================================================ - -template -static void checkOneTap( - const BAccT& batchAcc, - nanovdb::util::Simd voVec, - LaneMask leafMask, - nanovdb::Coord centerLeafOrigin, - const LeafT* firstLeaf, - uint32_t currentLeafID, - const uint16_t* voxelOffset, - int batchStart, - AccT& refAcc, - VerifyStats& stats) -{ - nanovdb::util::Simd tapResult(uint64_t(0)); - batchAcc.cachedGetValue(tapResult, voVec, leafMask); - - for (int i = 0; i < SIMDw; ++i) { - if (!leafMask[i]) continue; - ++stats.laneChecks; - - const uint16_t vo_i = voxelOffset[batchStart + i]; - const int lx = (vo_i >> 6) & 7; - const int ly = (vo_i >> 3) & 7; - const int lz = vo_i & 7; - const int nx = lx + di, ny = ly + dj, nz = lz + dk; - const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0; - const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0; - const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0; - const int nx_w = nx - dx * 8; - const int ny_w = ny - dy * 8; - const int nz_w = nz - dz * 8; - const uint32_t offset = uint32_t(nx_w) * 64u + uint32_t(ny_w) * 8u + uint32_t(nz_w); - - const LeafT* refLeaf; - if (dx == 0 && dy == 0 && dz == 0) { - refLeaf = &firstLeaf[currentLeafID]; - } else { - refLeaf = refAcc.probeLeaf( - centerLeafOrigin + nanovdb::Coord(dx * 8, dy * 8, dz * 8)); - } - - const uint64_t expected = refLeaf - ? 
static_cast(refLeaf->getValue(offset)) - : uint64_t(0); - const uint64_t actual = static_cast(tapResult[i]); - - if (actual != expected) { - ++stats.errors; - if (stats.errors <= 10) { - std::cerr << "BATCHACC MISMATCH" - << " tap=(" << di << "," << dj << "," << dk << ")" - << " lane=" << i - << " expected=" << expected - << " actual=" << actual << "\n"; - } - } - } -} - -/// @brief Cross-validate BatchAccessor::cachedGetValue for all 18 WENO5 non-center taps. -/// Requires the 6 extremal prefetches to have been called first. -static void verifyBatchAccessor( - const BAccT& batchAcc, - nanovdb::util::Simd voVec, - LaneMask leafMask, - nanovdb::Coord centerLeafOrigin, - const LeafT* firstLeaf, - uint32_t currentLeafID, - const uint16_t* voxelOffset, - int batchStart, - AccT& refAcc, - VerifyStats& stats) -{ - // x-axis taps (di in {-3,-2,-1,+1,+2,+3}) - checkOneTap<-3, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap<-2, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap<-1, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap<+1, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap<+2, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap<+3, 0, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - // y-axis taps - checkOneTap< 0,-3, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0,-2, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0,-1, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0,+1, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0,+2, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0,+3, 0>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - // z-axis taps - checkOneTap< 0, 0,-3>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0, 0,-2>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0, 0,-1>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0, 0,+1>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0, 0,+2>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); - checkOneTap< 0, 0,+3>(batchAcc, voVec, leafMask, centerLeafOrigin, firstLeaf, currentLeafID, voxelOffset, batchStart, refAcc, stats); -} - -// ============================================================ -// StencilAccessor correctness verification -// -// For every active lane (set in activeMask returned by moveTo): -// - Reconstruct the 
global coordinate from (leafIndex, voxelOffset). -// - For each of the 18 WENO5 taps, add the tap offset, decompose into -// leaf-local coordinates, probe the neighbor leaf, and compare -// stencilAcc[k][lane] against refLeaf->getValue(localOffset). -// -// For every inactive lane: -// - Assert that all tap slots hold 0 (the NanoVDB background index). -// ============================================================ - +/// Cross-validate one StencilAccessor batch against LegacyStencilAccessor. +/// +/// Active lanes: reconstruct the global coordinate from (leafIndex, voxelOffset), +/// call legacyAcc.moveTo(), and compare all SIZE tap indices element-by-element. +/// +/// Inactive lanes: assert all tap slots in stencilAcc hold 0 (background index). static void verifyStencilAccessor( const SAccT& stencilAcc, - IndexMaskT activeMask, // returned by stencilAcc.moveTo() + IndexMaskT activeMask, const uint32_t* leafIndex, const uint16_t* voxelOffset, int batchStart, const LeafT* firstLeaf, - AccT& refAcc, + LegacyAccT& legacyAcc, VerifyStats& stats) { - // Check inactive lanes: all tap slots must hold 0 (background index). + // Inactive lanes: all tap slots must hold 0 (NanoVDB background index). for (int i = 0; i < SIMDw; ++i) { if (activeMask[i]) continue; for (int k = 0; k < stencilAcc.size(); ++k) { @@ -669,58 +150,27 @@ static void verifyStencilAccessor( } } - // Check active lanes against the scalar tree reference. + // Active lanes: compare against the LegacyStencilAccessor oracle. for (int i = 0; i < SIMDw; ++i) { if (!activeMask[i]) continue; - const int p = batchStart + i; - const uint16_t vo = voxelOffset[p]; - const uint32_t li = leafIndex[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - - // Center voxel local coordinates within the leaf. - const int lx = (vo >> 6) & 7; - const int ly = (vo >> 3) & 7; - const int lz = vo & 7; - - for (int k = 0; k < 18; ++k) { - const int axis = kWeno5Offsets[k][0]; - const int delta = kWeno5Offsets[k][1]; - const int di = (axis == 0) ? delta : 0; - const int dj = (axis == 1) ? delta : 0; - const int dk = (axis == 2) ? delta : 0; - - // Tap destination in leaf-local space (may be outside [0,7]). - const int nx = lx + di, ny = ly + dj, nz = lz + dk; - - // Leaf-crossing step (−1, 0, or +1 per axis). - const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0; - const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0; - const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0; - - // Wrapped local coordinates within the target leaf. - const int nx_w = nx - dx * 8; - const int ny_w = ny - dy * 8; - const int nz_w = nz - dz * 8; - const uint32_t offset = uint32_t(nx_w) * 64u + uint32_t(ny_w) * 8u + uint32_t(nz_w); - - // Reference: probe the target leaf and read its value. - const LeafT* refLeaf = (dx == 0 && dy == 0 && dz == 0) - ? &firstLeaf[li] - : refAcc.probeLeaf(cOrigin + nanovdb::Coord(dx * 8, dy * 8, dz * 8)); - - const uint64_t expected = refLeaf - ? 
static_cast(refLeaf->getValue(offset)) - : uint64_t(0); - const uint64_t actual = static_cast(stencilAcc[k][i]); + const int p = batchStart + i; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + + legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); + for (int k = 0; k < stencilAcc.size(); ++k) { ++stats.laneChecks; + const uint64_t expected = legacyAcc[k]; + const uint64_t actual = static_cast(stencilAcc[k][i]); if (actual != expected) { ++stats.errors; if (stats.errors <= 10) std::cerr << "STENCIL MISMATCH" - << " tap=(" << di << "," << dj << "," << dk << ")" - << " slot=" << k + << " tap=" << k << " lane=" << i << " expected=" << expected << " actual=" << actual << "\n"; @@ -730,190 +180,56 @@ static void verifyStencilAccessor( } // ============================================================ -// Main prototype: Phase 1 (neighbor leaf resolution) + verification +// Correctness run: cross-validate StencilAccessor vs LegacyStencilAccessor // ============================================================ -static void runPrototype(const GridT* grid, - const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) +static void runPrototype( + const GridT* grid, + const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) { - const auto& tree = grid->tree(); - const LeafT* firstLeaf = tree.getFirstNode<0>(); + const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); const uint64_t nVoxels = grid->activeVoxelCount(); const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); const uint64_t* jumpMap = vbmHandle.hostJumpMap(); - // One ReadAccessor per thread, reused across all blocks (§8c). - AccT acc = grid->getAccessor(); - - // Block-local scratch (stack-resident, stays in L1 across batches). alignas(64) uint32_t leafIndex[BlockWidth]; alignas(64) uint16_t voxelOffset[BlockWidth]; + // LegacyStencilAccessor owns its ReadAccessor; one instance per thread. + LegacyAccT legacyAcc(*grid); VerifyStats stats; - uint64_t nStraddles = 0; - for (uint32_t bID = 0; bID < nBlocks; bID++) { + for (uint32_t bID = 0; bID < nBlocks; ++bID) { const uint64_t blockFirstOffset = vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth; - // Decode inverse maps. CPUVBM::decodeInverseMaps( - grid, - firstLeafID[bID], + grid, firstLeafID[bID], &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength], - blockFirstOffset, - leafIndex, - voxelOffset); + blockFirstOffset, leafIndex, voxelOffset); - // Recompute nLeaves from jumpMap; avoids modifying decodeInverseMaps API - // and keeps CPU/CUDA API symmetric (§9). int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; w++) + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) nExtraLeaves += nanovdb::util::countOn( jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]); - // Block-level neighbor-leaf resolution state (§8d, §8f). - uint32_t currentLeafID = firstLeafID[bID]; - uint32_t probedMask = 0; - const LeafT* ptrs[27] = {}; - nanovdb::Coord centerLeafCoord = firstLeaf[currentLeafID].origin(); - - // BatchAccessor: alternate execution path for correctness cross-validation. - BAccT batchAcc(*grid, currentLeafID); - - // StencilAccessor: constructed once per block, persists across batches. SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - // Process SIMD batches. 
for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - - // Load the SIMDw leafIndex values for this batch once; reused below. - const LeafIdxVec leafSlice(&leafIndex[batchStart], nanovdb::util::element_aligned); - - // Active-lane mask: lanes with a valid (non-sentinel) leafIndex. - LaneMask activeMask = (leafSlice != LeafIdxVec(CPUVBM::UnusedLeafIndex)); - if (nanovdb::util::none_of(activeMask)) continue; - - // StencilAccessor: gather all 18 WENO5 tap indices for this batch. - // moveTo owns the straddling loop internally; call once per batch. - { - const IndexMaskT sActive = - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - verifyStencilAccessor(stencilAcc, sActive, leafIndex, voxelOffset, - batchStart, firstLeaf, acc, stats); - } - - // Track straddle batches for diagnostic output. - for (int i = 0; i < SIMDw; i++) { - if (activeMask[i] && leafIndex[batchStart + i] != currentLeafID) { - nStraddles++; - break; - } - } - - // Inner loop: consume one center leaf's worth of lanes per iteration. - while (nanovdb::util::any_of(activeMask)) { - // Identify lanes belonging to currentLeafID. - LaneMask leafMask = activeMask & (leafSlice == LeafIdxVec(currentLeafID)); - - if (nanovdb::util::none_of(leafMask)) { - // No lanes for currentLeafID: advance to next leaf. - assert(currentLeafID < firstLeafID[bID] + (uint32_t)nExtraLeaves); - currentLeafID++; - probedMask = 0; - centerLeafCoord = firstLeaf[currentLeafID].origin(); - batchAcc.advance(currentLeafID); - continue; - } - - // --- Phase 1: probe newly needed neighbor leaves (§8d) --- - // - // Build the pre-expanded vector at the gather site — the only - // place where leafMask is known. Broadcast the sentinel first - // (straddle / inactive lanes stay neutral), then overwrite the - // leafMask lanes with their actual expandVoxelOffset values. - using VecU32 = nanovdb::util::Simd; - VecU32 expandedVec(kSentinelExpanded); - for (int i = 0; i < SIMDw; i++) { - if (leafMask[i]) - expandedVec[i] = expandVoxelOffset(voxelOffset[batchStart + i]); - } - const uint32_t neededMask = computeNeededDirs(expandedVec); - - // Cross-check against scalar reference (always-on; overhead is - // ~18 scalar ops per batch, negligible vs. the probeLeaf calls). 
- { - const uint32_t ref = computeNeededDirsScalar(voxelOffset, batchStart, leafMask); - if (neededMask != ref) { - std::cerr << "computeNeededDirs: SIMD/scalar mismatch" - << " simd=0x" << std::hex << neededMask - << " ref=0x" << ref << std::dec << "\n"; - std::abort(); - } - } - - uint32_t toProbe = neededMask & ~probedMask; - - while (toProbe) { - const int d = __builtin_ctz(toProbe); - ptrs[d] = acc.probeLeaf(neighborLeafOrigin(centerLeafCoord, d)); - probedMask |= (1u << d); - toProbe &= toProbe - 1; - } - - // --- Phase 2: populate per-lane batchPtrs[4][SIMDw] (§6c) --- - // batchPtrs[0][i] = center leaf - // batchPtrs[1][i] = x-axis neighbor (lo, hi, or nullptr) - // batchPtrs[2][i] = y-axis neighbor - // batchPtrs[3][i] = z-axis neighbor - const LeafT* batchPtrs[4][SIMDw] = {}; - for (int i = 0; i < SIMDw; i++) { - if (!leafMask[i]) continue; - batchPtrs[0][i] = &firstLeaf[currentLeafID]; - for (int axis = 0; axis < 3; axis++) { - const int lc = localAxisCoord(voxelOffset[batchStart + i], axis); - if (lc < R) - batchPtrs[1 + axis][i] = ptrs[kLoBit[axis]]; - else if (lc >= 8-R) - batchPtrs[1 + axis][i] = ptrs[kHiBit[axis]]; - // else: nullptr (interior lane for this axis) - } - } - - // --- Verification (Phase 1 pointer check) --- - verifyBatchPtrs(batchPtrs, firstLeaf, leafIndex, voxelOffset, - batchStart, leafMask, acc, stats); - - // --- BatchAccessor alternate path + cross-validation --- - // - // 6 extremal WENO5 prefetches cover all face-neighbor directions. - // The center direction (dir(0,0,0)) is guaranteed populated by at - // least one of these calls (see BatchAccessor.md §5). - using VoVecT = nanovdb::util::Simd; - const VoVecT voVec(&voxelOffset[batchStart], nanovdb::util::element_aligned); - batchAcc.prefetch<-3, 0, 0>(voVec, leafMask); - batchAcc.prefetch<+3, 0, 0>(voVec, leafMask); - batchAcc.prefetch< 0, -3, 0>(voVec, leafMask); - batchAcc.prefetch< 0, +3, 0>(voVec, leafMask); - batchAcc.prefetch< 0, 0, -3>(voVec, leafMask); - batchAcc.prefetch< 0, 0, +3>(voVec, leafMask); - - verifyBatchAccessor(batchAcc, voVec, leafMask, centerLeafCoord, - firstLeaf, currentLeafID, voxelOffset, - batchStart, acc, stats); - - activeMask = activeMask & !leafMask; - } + const IndexMaskT active = + stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + verifyStencilAccessor(stencilAcc, active, + leafIndex, voxelOffset, batchStart, + firstLeaf, legacyAcc, stats); } } - std::cout << "Prototype (Phase 1 + BatchAccessor verification):\n" - << " blocks = " << nBlocks << "\n" - << " voxels = " << nVoxels << "\n" - << " straddles = " << nStraddles << "\n" - << " laneChecks = " << stats.laneChecks << "\n"; + std::cout << "Correctness (StencilAccessor vs LegacyStencilAccessor):\n" + << " blocks = " << nBlocks << "\n" + << " voxels = " << nVoxels << "\n" + << " laneChecks = " << stats.laneChecks << "\n"; if (stats.errors == 0) std::cout << " PASSED\n"; @@ -922,76 +238,173 @@ static void runPrototype(const GridT* } // ============================================================ -// Performance measurement: StencilAccessor::moveTo throughput +// End-to-end performance comparison (multithreaded) +// +// Both paths run the full pipeline inside util::forEach: +// decodeInverseMaps → coord extraction → stencil gather → sum → store // -// Two-pass design: first pass warms instruction cache, branch predictor, -// and the leaf data accessed by advance()/prefetch(). Second pass is timed. -// decodeInverseMaps is outside the rdtsc fence — we measure moveTo only. 
+// decodeInverseMaps is deliberately included: its cost is identical for +// both paths (pure cancellation in the comparison) and including it avoids +// fine-grained intra-block timing artifacts. // -// Reports TSC ticks/batch and TSC ticks/voxel (using BlockWidth as denominator; -// slightly over-counts inactive padding slots but is stable across runs). -// TSC ticks ≈ ns × (nominal_GHz); divide by actual turbo frequency for -// CPU cycles if needed. +// Anti-DCE artifact: for each active voxel, accumulate the sum of all 18 +// tap uint64_t indices and write to sums[bID * BlockWidth + i]. The final +// XOR checksum is printed, forcing the compiler to materialise the stores. +// +// Timing: nanovdb::util::Timer (steady_clock) around each forEach. +// warm pass discards its measurement; only the second pass is reported. +// +// Denominator: grid->activeVoxelCount() — same for both paths. // ============================================================ -static void runPerf(const GridT* grid, - const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) +static void runPerf( + const GridT* grid, + const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) { const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); + const uint64_t nVoxels = grid->activeVoxelCount(); const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); const uint64_t* jumpMap = vbmHandle.hostJumpMap(); - - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - static constexpr int kBatchesPerBlock = BlockWidth / SIMDw; - - // Shared decode + moveTo loop, run twice (warmup then timed). - uint64_t totalTicks = 0; - - for (int pass = 0; pass < 2; ++pass) { - uint64_t passTicks = 0; - - for (uint32_t bID = 0; bID < nBlocks; ++bID) { - const uint64_t blockFirstOffset = - vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth; - - // Decode is outside the timed region. - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength], - blockFirstOffset, leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - - _mm_lfence(); - const uint64_t t0 = __rdtsc(); - - for (int b = 0; b < kBatchesPerBlock; ++b) - stencilAcc.moveTo(leafIndex + b * SIMDw, voxelOffset + b * SIMDw); - - uint32_t aux; - const uint64_t t1 = __rdtscp(&aux); - - passTicks += (t1 - t0); - } - - if (pass == 1) totalTicks = passTicks; // only record the warm pass - } - - const uint64_t totalBatches = (uint64_t)nBlocks * kBatchesPerBlock; - const uint64_t totalVoxels = (uint64_t)nBlocks * BlockWidth; - - std::printf("\nStencilAccessor::moveTo throughput (warm pass, %u blocks):\n", nBlocks); - std::printf(" total TSC ticks : %lu\n", totalTicks); - std::printf(" ticks / batch : %.1f\n", double(totalTicks) / double(totalBatches)); - std::printf(" ticks / voxel : %.2f\n", double(totalTicks) / double(totalVoxels)); + const uint64_t firstOffset = vbmHandle.firstOffset(); + + // Anti-DCE output array. Each thread writes its own non-overlapping + // range (bID * BlockWidth ... + BlockWidth - 1) — no synchronisation needed. + std::vector sums((size_t)nBlocks * BlockWidth, 0); + + std::ostringstream sink; // absorbs Timer's warm-pass "... 
" output + nanovdb::util::Timer timer; + + auto timeForEach = [&](auto&& body) -> double { + // warm pass + timer.start("", sink); + body(); + timer.elapsed(); + // timed pass + timer.start("", sink); + body(); + return static_cast(timer.elapsed()); + }; + + // ---- decodeInverseMaps-only baseline (both paths pay this cost) ---- + // Anti-DCE: XOR one uint64_t per block derived from leafIndex[] + voxelOffset[] + // so the compiler can't elide the decode work. + const double decodeUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t acc = 0; + for (int i = 0; i < BlockWidth; ++i) + acc ^= (uint64_t(leafIndex[i]) << 16) | uint64_t(voxelOffset[i]); + sums[bID * BlockWidth] = acc; // one slot per block as anti-DCE + } + }); + }); + + // ---- StencilAccessor ---- + std::fill(sums.begin(), sums.end(), uint64_t(0)); + + const double stencilUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + uint64_t* bs = sums.data() + bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + const IndexMaskT active = + stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + for (int i = 0; i < SIMDw; ++i) { + if (!active[i]) continue; + uint64_t s = 0; + for (int k = 0; k < SAccT::size(); ++k) + s += static_cast(stencilAcc[k][i]); + bs[batchStart + i] = s; + } + } + } + }); + }); + + const uint64_t stencilChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + + // ---- LegacyStencilAccessor ---- + std::fill(sums.begin(), sums.end(), uint64_t(0)); + + const double legacyUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + LegacyAccT legacyAcc(*grid); // one ReadAccessor per task + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t* bs = bs0 + bID * BlockWidth; + + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + legacyAcc.moveTo(cOrigin + 
nanovdb::Coord(lx, ly, lz)); + uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; + bs[i] = s; + } + } + }); + }); + + const uint64_t legacyChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + + std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", + nBlocks, nVoxels); + std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n", + decodeUs / 1e3, decodeUs * 1e3 / double(nVoxels)); + std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels), + (stencilUs - decodeUs) / 1e3, stencilChecksum); + std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), + (legacyUs - decodeUs) / 1e3, legacyChecksum); + + if (stencilChecksum != legacyChecksum) + std::cerr << " WARNING: checksums differ — accessor results disagree!\n"; } // ============================================================ @@ -1001,10 +414,7 @@ static void runPerf(const GridT* int main(int argc, char** argv) { try { - // Targeted sentinel test runs unconditionally before any VBM data is needed. - verifyComputeNeededDirsSentinel(); - - int ambient_voxels = 1024 * 1024; // smaller default than the CUDA test + int ambient_voxels = 1024 * 1024; float occupancy = 0.5f; if (argc > 1) ambient_voxels = std::stoi(argv[1]); @@ -1023,8 +433,6 @@ int main(int argc, char** argv) buildGrid.tree().setValue(coord, 1.f); // Convert build::Grid → NanoGrid → NanoGrid. - // Two-step because createNanoGrid accepts NanoGrid - // as its source type (same path as ex_index_grid_cuda). auto floatHandle = nanovdb::tools::createNanoGrid(buildGrid); auto indexHandle = nanovdb::tools::createNanoGrid< nanovdb::NanoGrid, @@ -1042,7 +450,6 @@ int main(int argc, char** argv) << " Upper=" << tree.nodeCount(2) << " Active=" << grid->activeVoxelCount() << "\n"; - // Build VBM. auto vbmHandle = nanovdb::tools::buildVoxelBlockManager(grid); std::cout << "VBM blocks=" << vbmHandle.blockCount() << " (BlockWidth=" << BlockWidth << ")\n\n"; diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md index 47a3d0bfbe..9febb5b6d8 100644 --- a/nanovdb/nanovdb/util/BatchAccessor.md +++ b/nanovdb/nanovdb/util/BatchAccessor.md @@ -722,6 +722,183 @@ threads access adjacent values) and **cache footprint** (keeping the neighbour-l working set in L1/shared memory), rather than the gather-chain depth that dominates on CPU. +### 8h. End-to-end perf: outlining, `[[gnu::flatten]]`, and W=8 + +§8f measured `cachedGetValue` as a standalone symbol. This section measures the +**full WENO5 pipeline end-to-end** — `StencilAccessor::moveTo` driving 18 taps × +128 voxels/block × 131072 blocks across 32 TBB threads — and reveals a much +larger GCC pathology that a single-function measurement cannot see. + +Workload: `ex_stencil_gather_cpu 33554432 0.5` (16 M active voxels, 50% occupancy, +i9-285K Arrow Lake, 32 threads, `-O3 -march=native`). Time is wall clock via +`nanovdb::util::Timer`; checksum-matches `LegacyStencilAccessor` in every run. 
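The two-pass timing pattern, condensed from `runPerf` in
`stencil_gather_cpu.cpp` (sketch; assumes only the `Timer::start(msg, os)` /
`Timer::elapsed()` API the example itself uses, with `elapsed()` in microseconds):

```cpp
// Warm pass primes I-cache, branch predictors, and page tables; only the
// second, warmed pass is reported. The ostringstream swallows Timer's log.
auto timeWarmThenMeasure = [&](auto&& body) -> double {
    std::ostringstream sink;
    nanovdb::util::Timer timer;
    timer.start("", sink); body(); timer.elapsed();   // warm, discarded
    timer.start("", sink); body();
    return static_cast<double>(timer.elapsed());      // microseconds
};
```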
+ +#### End-to-end latency (ns/voxel, smaller is better) + +| Variant | GCC 13 | Clang 18 | +|--------------------------------------|-------:|---------:| +| No `flatten` | 7.5 | 4.3 | +| `flatten` on `BatchAccessor::{prefetch,cachedGetValue}` | 4.9 | 4.3 | +| `flatten` on `StencilAccessor::moveTo` (full transitive) | **3.7** | 4.3 | +| `LegacyStencilAccessor` reference | 5.4 | 6.7 | + +Without `flatten`, GCC's SIMD `StencilAccessor` is **39% slower than the scalar +`ReadAccessor`-based `LegacyStencilAccessor`** — the SIMD abstraction turns into a +net loss. With `[[gnu::flatten]]` on `moveTo`, GCC becomes 33% faster than scalar +and edges out Clang. + +#### Per-batch call accounting (GCC, W=16) + +`moveTo` processes 16 voxels per batch. Per-batch call count is the product of: + +| Call site | No flatten | moveTo flatten | +|-------------------------------------|-----------:|---------------:| +| `moveTo` → `prefetchHull`, `calcTaps` | 3 | inlined | +| `prefetchHull` internals | 12 | inlined | +| `calcTaps` → 18× `cachedGetValue` + 18× `WhereExpression::op=` | 37 | inlined | +| Inside each `cachedGetValue`: 14 outlined Simd.h helpers × 18 | 252 | inlined | +| Stack-canary / misc | 19 | 0 | +| **Total calls per batch** | **~323** | **0** | +| **Total `vzeroupper` per batch** | **~282** | **1** (epilogue) | + +At 16 voxels/batch, that is **~18 `vzeroupper` per voxel** without flatten. Each +VZU is cheap (~1–2 cycles) but serves as a strong ABI barrier that defeats the +out-of-order engine's ability to overlap pre- and post-call work. Combined with +the per-call argument marshaling of `_Fixed<16>` aggregates (128 B by reference), +the accumulated cost is the full 3.2 ns/voxel gap between the two variants. + +#### Why outlining happens under GCC + +Each Simd.h helper (`gather`, `gather_if`, `simd_cast`, `simd_cast_if`, `where`, +`popcount`, `WhereExpression::op=`) is an `inline` template. With `-O3`, GCC's +inliner decides each is "too expensive to inline" once the caller +(`cachedGetValue`, ~900 B) reaches a growth-budget threshold. It emits each +helper as a weak COMDAT and calls it. Every such call takes `_Fixed<16>` +aggregates by reference (the parameter doesn't fit in YMM), triggering +`vzeroupper` on entry. + +The same pattern propagates up: `calcTaps` (after inlining) is too big to accept +18 copies of `cachedGetValue`, so GCC outlines those too — one weak symbol per +template instantiation. Then `StencilAccessor::moveTo` calls `calcTaps` and +`prefetchHull` across that same boundary. + +Clang's inliner makes different decisions — it inlines the Simd.h helpers into +each `cachedGetValue`, keeps `cachedGetValue` outlined per-tap, and accepts the +18 calls from `calcTaps`. Clang also emits hardware gathers under `-march=native` +(16 `vpgather` per tap, see §8f), amortising the per-call cost with faster +gather semantics. + +#### Why `[[gnu::flatten]]` on `moveTo` wins + +`__attribute__((flatten))` forces **every call** in the annotated function's body +to be inlined, recursively — overriding all cost heuristics. Applied to +`StencilAccessor::moveTo`, it collapses the entire call tree (`prefetchHull`, +`calcTaps`, 18× `cachedGetValue`, 14× helpers per tap) into one monolithic +inlined body. Observed: **0 calls, 1 `vzeroupper` (function epilogue only), +14 350 insns, 77 KB of text in a single symbol**. + +Trade-offs: + +- Binary size: one 77 KB function per `StencilAccessor` instantiation. 
L1i is + 32 KB, but the per-batch hot path only sweeps a small fraction of the body + linearly, so I-cache pressure is manageable. +- Debuggability: one giant symbol to step through vs 40+ small symbols. +- Compile time: GCC spends notably longer compiling a flattened `moveTo`. + +#### Why `flatten` on `BatchAccessor::prefetch`/`cachedGetValue` alone is insufficient + +Flattening at the BatchAccessor level inlines the 14 Simd.h helpers into each +`cachedGetValue`/`prefetch` body (so each of those becomes a clean, self-contained +~800-insn function with ≤2 residual calls — typically `WhereExpression::op=` and +the `_S_generator` stdx lambda for `popcount`). However it leaves the 18 +`cachedGetValue` call sites *themselves* outlined — `calcTaps` still pays 38 +calls and 26 `vzeroupper` per batch. Measured: 4.9 ns/voxel — halfway between +no-flatten and full-flatten. + +The signal is clear: the *outer* `moveTo` → `calcTaps` → per-tap call boundary +is the dominant cost, not the inner helper-call boundary. + +#### W=8 experiment (batch-width halving) + +Motivation: halving the batch width reduces register pressure and spill volume, +and shifts some types from `_Fixed` to `_VecBuiltin<32>` (the native +`__m256i` ABI). Specifically at W=8: + +- `Simd` — 16 B, `_VecBuiltin<16>` (native XMM) +- `Simd` — 32 B, `_VecBuiltin<32>` (native YMM) ✓ register-passable +- `Simd` — 64 B, still `_Fixed<8>` (2× YMM aggregate, not passable) +- `Simd` — same as uint64 + +Only the `uint32_t` leaf-ID/mask vectors become register-passable; the dominant +`uint64_t` index vectors are still aggregate (half the size of the W=16 +aggregate, but still stack-passed). + +Measured at W=8 with full flatten: + +| Metric | W=16 | W=8 | Δ | +|-------------------------|--------:|--------:|--------:| +| `moveTo` text size | 77 KB | 34 KB | −56% | +| `moveTo` insns | 14,349 | 7,182 | −50% | +| YMM spill stores | 469 | 67 | **−86%**| +| YMM spill loads | 351 | 167 | −52% | +| vpinsrq (software-gather glue) | 432 | 216 | −50% | +| `vpgather*` | 0 | 0 | unchanged | +| `vzeroupper` | 1 | 1 | unchanged | +| **End-to-end (GCC)** | **3.7 ns/vox** | 4.2 ns/vox | +0.5 | +| **End-to-end (Clang)** | 4.3 ns/vox | 4.0 ns/vox | −0.3 | + +W=8 dramatically reduces register pressure (the spill count is 86% lower). But +GCC's end-to-end time regresses by 0.5 ns/voxel because the per-batch framing +cost (`zeroIndices`, `leafSlice == centerLeafID` mask compute, straddling +loop control, `prefetchHull`) is now amortised across only 8 lanes instead of +16. The body of `moveTo` halved; the surrounding scaffolding doubled. + +Clang benefits slightly (−0.3 ns/voxel), likely because its outlined +`cachedGetValue` was paying more call-frame marshaling at W=16 (4× YMM aggregate +vs 2× YMM at W=8). + +**Takeaway for future design**: W=8 would become attractive if the per-batch +framing work can be amortised across multiple adjacent batches — for example, +hoisting `prefetchHull` outside the batch loop for cases where the hull mask +is invariant across several batches of the same center-leaf. + +#### Findings + +**F1 — GCC's default codegen for this abstraction is broken.** Without +`flatten` or equivalent attributes, GCC emits ~323 calls / ~282 `vzeroupper` +per 16-voxel batch, making the SIMD `StencilAccessor` *slower* than the scalar +`LegacyStencilAccessor`. 
+
+**F2 — `[[gnu::flatten]]` on `StencilAccessor::moveTo` restores performance.**
+One attribute, targeting the WENO5 pipeline entry point, drops GCC from 7.5 to
+3.7 ns/voxel (2×) and makes GCC the fastest of the measured configurations.
+
+**F3 — Partial flattening at `BatchAccessor::{prefetch,cachedGetValue}` is not
+enough.** The inner helper calls are eliminated but the 18 `cachedGetValue`
+call sites themselves remain — 4.9 ns/voxel.
+
+**F4 — Hardware gathers are not needed on Arrow Lake.** GCC emits 0 `vpgather`
+in all variants; Clang+native emits 16 per `cachedGetValue`. GCC's
+software-gather path (scalar loads + `vpinsrq`) nevertheless beats Clang's
+hardware-gather path end-to-end (3.7 vs 4.3 ns/voxel) because the three load
+ports issue the scalar gathers in parallel and the out-of-order engine hides
+the latency. §8f Finding 5 (unmasked-gather auto-vectorisation) remains
+correct; it is simply not load-bearing on this microarchitecture.
+
+**F5 — W=8 reduces spills dramatically but does not help end-to-end on GCC.**
+Per-batch framing cost dominates at smaller widths.
+
+**F6 — Clang's performance is relatively insensitive to these knobs.**
+Clang inlines the Simd.h helpers regardless of `flatten`, and its outlined
+`cachedGetValue` pays only moderate call overhead. It stays within
+4.0–4.3 ns/voxel across all variants tested.
+
+**Not applied.** The codebase does not ship `[[gnu::flatten]]` by default.
+StencilAccessor-style callers that require peak GCC performance may apply it
+to their own hot entry point; the attribute is safe and a no-op under Clang.
+This choice keeps the library's default codegen predictable and avoids forcing
+a 77 KB monolithic body on callers with smaller working sets.
+
 ---

 ## 9. Relationship to Phase 1 Prototype
@@ -759,12 +936,25 @@ on CPU.
   `maskWords=0` for invalid lanes → `isActive=false` without cross-width mask AND.
   Verified: 12M lane-checks pass across all 18 WENO5 taps. Unlocks hardware
   `vpgatherqq` in the array backend under Clang + native tuning.
+- **End-to-end codegen analysis (§8h)**: measured the full WENO5 pipeline
+  (`StencilAccessor::moveTo` × 131 K blocks × 32 threads) on i9-285K Arrow Lake.
+  Established that GCC's default `-O3` outlines 14 Simd.h helpers per
+  `cachedGetValue` and outlines `cachedGetValue`/`WhereExpression::op=` per tap,
+  producing ~282 `vzeroupper` per 16-voxel batch and making the SIMD path
+  slower than scalar `LegacyStencilAccessor`. `[[gnu::flatten]]` on
+  `StencilAccessor::moveTo` collapses the full call tree and drops GCC from
+  7.5 to 3.7 ns/voxel (2×), beating Clang's 4.3 ns/voxel. W=8 cuts spills by
+  86% but regresses GCC end-to-end due to per-batch framing overhead.
+  Attributes **not applied** in the shipped code; see §8h "Not applied" note.

 ### Remaining

-- **`[[gnu::always_inline]]` on `Simd.h` helpers:** `gather_if`, `simd_cast`,
-  `simd_cast_if`, `where`, `popcount` — eliminates 13 `vzeroupper` transitions per
-  `cachedGetValue` call under GCC 13 (§8f).
+- **`[[gnu::always_inline]]` on `Simd.h` helpers** (§8f) vs
+  **`[[gnu::flatten]]` on StencilAccessor-style entry points** (§8h):
+  two candidate approaches to restore GCC inlining. The flatten path was
+  measured end-to-end (2× speedup); the always_inline path was measured only
+  on the standalone `cachedGetValue` symbol. Decide which to ship once a
+  consumer of StencilAccessor exists in the production build.
- **`vpshufb`-based `popcount` in `Simd.h`:** replace `popcount64` SWAR tree
  with nibble-LUT + `vpsadbw` pattern (§8f); reduces the out-of-line body from
  88 to ≈40 insns.
diff --git a/nanovdb/nanovdb/util/LegacyStencilAccessor.h b/nanovdb/nanovdb/util/LegacyStencilAccessor.h
new file mode 100644
index 0000000000..6fe63df257
--- /dev/null
+++ b/nanovdb/nanovdb/util/LegacyStencilAccessor.h
@@ -0,0 +1,114 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+
+/*!
+    \file LegacyStencilAccessor.h
+
+    \brief Scalar stencil-index accessor using a NanoVDB ReadAccessor.
+
+    LegacyStencilAccessor resolves each stencil tap via a path-cached
+    NanoVDB ReadAccessor, one voxel at a time. It is templatized on the
+    same StencilT policy class used by StencilAccessor, so the tap-offset
+    table is shared at compile time.
+
+    This mirrors the approach of OpenVDB's math/Stencils.h: the accessor
+    caches the last-visited tree path so that consecutive taps within the
+    same leaf are cheap, but distant taps (e.g. WENO5 radius-3 offsets)
+    can evict the center-leaf path. That cache-pressure problem is the
+    motivation for the BatchAccessor / StencilAccessor design.
+
+    Intended uses
+    -------------
+    - Correctness oracle for StencilAccessor: sharing StencilT guarantees
+      identical tap offsets, so a mismatch is a genuine bug.
+    - Benchmark baseline: measures the cost of the accessor path-eviction
+      problem that StencilAccessor is designed to eliminate.
+
+    Thread safety
+    -------------
+    Each instance owns its ReadAccessor. Construct one per thread.
+
+    Template parameters
+    -------------------
+    BuildT     NanoVDB build type (e.g. ValueOnIndex).
+    StencilT   Policy class describing the stencil. Must expose:
+                   using Taps = std::tuple<StencilPoint<di, dj, dk>...>;
+               Same type as passed to StencilAccessor.
+*/
+
+#pragma once
+
+#include <nanovdb/NanoVDB.h>
+#include <nanovdb/util/StencilAccessor.h> // StencilPoint, detail::findIndex
+
+#include <cstdint>
+#include <tuple>
+#include <type_traits>
+#include <utility> // std::index_sequence, std::make_index_sequence
+
+namespace nanovdb {
+
+template<typename BuildT, typename StencilT>
+class LegacyStencilAccessor
+{
+    using GridT = NanoGrid<BuildT>;
+
+    static constexpr int SIZE = int(std::tuple_size_v<typename StencilT::Taps>);
+
+public:
+    explicit LegacyStencilAccessor(const GridT& grid)
+        : mAcc(grid.getAccessor()) {}
+
+    // -------------------------------------------------------------------------
+    // moveTo -- resolve all SIZE tap indices for the voxel at @a center.
+    //
+    // Calls ReadAccessor::getValue(center + offset) for each tap in StencilT::Taps.
+    // The path cache inside mAcc amortizes tree-traversal cost for nearby taps,
+    // but distant taps (e.g. WENO5 ±3) may evict the center-leaf path.
+    //
+    // Results are valid until the next moveTo() call.
+    // -------------------------------------------------------------------------
+    void moveTo(const Coord& center)
+    {
+        fillTaps(center, std::make_index_sequence<SIZE>{});
+    }
+
+    // -------------------------------------------------------------------------
+    // operator[] -- indexed tap access. i must be in [0, SIZE).
+    // -------------------------------------------------------------------------
+    uint64_t operator[](int i) const { return mStencil[i]; }
+
+    // -------------------------------------------------------------------------
+    // getValue -- compile-time named tap access.
+    //
+    // Same interface as StencilAccessor::getValue; resolved entirely at
+    // compile time via detail::findIndex.
+    // -------------------------------------------------------------------------
+    template<int di, int dj, int dk>
+    uint64_t getValue() const
+    {
+        constexpr int I = detail::findIndex<StencilT, di, dj, dk>(
+            std::make_index_sequence<SIZE>{});
+        static_assert(I >= 0, "LegacyStencilAccessor::getValue: tap not in stencil");
+        return mStencil[I];
+    }
+
+    static constexpr int size() { return SIZE; }
+
+private:
+    template<size_t... Is>
+    void fillTaps(const Coord& center, std::index_sequence<Is...>)
+    {
+        using Taps = typename StencilT::Taps;
+        ((mStencil[Is] = static_cast<uint64_t>(
+            mAcc.getValue(center + Coord(
+                std::tuple_element_t<Is, Taps>::di,
+                std::tuple_element_t<Is, Taps>::dj,
+                std::tuple_element_t<Is, Taps>::dk)))), ...);
+    }
+
+    DefaultReadAccessor<BuildT> mAcc;
+    uint64_t mStencil[SIZE];
+};
+
+} // namespace nanovdb
diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md
index 2f6805026b..ee86340506 100644
--- a/nanovdb/nanovdb/util/StencilAccessor.md
+++ b/nanovdb/nanovdb/util/StencilAccessor.md
@@ -380,6 +380,21 @@ Both loops expand to zero-overhead compile-time instantiations:
 where `blendOneTap` calls `cachedGetValue` into a temporary and then
 `where`-blends into `mIndices[I]`.

+### 8.1 GCC codegen note — `[[gnu::flatten]]` on `moveTo`
+
+Under GCC 13 + `-O3`, the default inliner outlines both the 14 Simd.h helpers
+inside each `cachedGetValue` and the 18 per-tap `cachedGetValue` calls
+themselves, producing ~282 `vzeroupper` transitions per 16-voxel batch and
+making this whole SIMD pipeline measurably slower than the scalar
+`LegacyStencilAccessor` oracle. Annotating `moveTo` with `[[gnu::flatten]]`
+collapses the full call tree into a single ~77 KB inlined body, restoring
+end-to-end performance from 7.5 ns/voxel to 3.7 ns/voxel (2×) and beating
+Clang's 4.3 ns/voxel in the same test. The attribute is a no-op under Clang
+(which inlines by default) and is safe to add, but the header does not apply
+it by default — see `BatchAccessor.md` §8h for the measurement matrix and the
+rationale for leaving it opt-in. Consumers that instantiate
+`StencilAccessor` in hot GCC-compiled code paths should consider enabling it.
+
 ---

 ## 9. `getValue()` — tap access by coordinate

From a05dce441503c24fccf38086225dd8676e32af1d Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sat, 18 Apr 2026 15:24:29 -0500
Subject: [PATCH 32/60] stencil_gather_cpu: add Legacy cost decomposition
 (framing / cache+leaf / tree-walk)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Decomposes LegacyStencilAccessor's 5.4 ns/voxel end-to-end cost into three
components by running two additional timed passes before the full Legacy pass:

- framing-only: full Legacy loop structure with no accessor call, using the
  center Coord components as the anti-DCE sum seed. Measures the
  decodeInverseMaps + Coord-compute + 18-iteration inner loop + store.

- center-hit x 18: calls mAcc.getValue() 18 times per voxel on distinct
  coords that all lie within the center voxel's leaf (an 8x2x1 slab
  parameterised by k in [0..17]). Every call is a guaranteed leaf-cache hit,
  so there is no tree walk; the distinct coords prevent CSE. Measures
  framing + accessor cache-check + leaf-local lookup.

- full: existing LegacyStencilAccessor path, for reference.
Subtracting gives:

  cache-check + leaf-local lookup         = center-hit − framing
  tree-walk cost (amortised over misses)  = full − center-hit

Measured breakdown (i9-285K Arrow Lake, 32 threads, 16.7 M active voxels):

  framing    : ~0.25 ns/vox  (5%)
  cache+leaf : ~0.94 ns/vox (17%, 0.05 ns/tap — fully load-port pipelined)
  tree walk  : ~4.24 ns/vox (78%, serialised adjacent-leaf re-descents)

Finding: tree walks on leaf-cache miss dominate LegacyStencilAccessor's cost
despite affecting only ~25% of taps. Each miss is ~4.5 cycles (a single
lower-node re-descent), but the misses chain serially — unlike the leaf-local
lookups, which the OoO engine fully pipelines through the three load ports.
This is the cost that a 27-leaf neighbour cache (BatchAccessor-style)
eliminates.

Signed-off-by: Efty Sifakis
---
 .../stencil_gather_cpu.cpp                    | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index 582f3e6773..b2954be9af 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -353,6 +353,90 @@ static void runPerf(
         std::accumulate(sums.begin(), sums.end(), uint64_t(0),
                         [](uint64_t a, uint64_t b) { return a ^ b; });

+    // ---- Legacy cost decomposition variants ----
+    // (a) "framing only" — Legacy loop structure, no accessor call (anti-DCE writes
+    //     use the center Coord components + k).
+    //     Measures: decodeInverseMaps + Coord compute + 18-iteration inner loop + anti-DCE store.
+    // (b) "center-hit only" — Legacy loop + 18× mAcc.getValue(center) instead of tap offsets.
+    //     Always hits the ReadAccessor's leaf cache → no tree walk.
+    //     Measures: framing + cache-query + leaf-local lookup (mValueMask + mPrefixSum + popcount).
+    // (c) "full" — the original LegacyStencilAccessor path.
+    //     Measures: framing + cache-query + leaf-local lookup + tree-walk-on-miss.
+    //
+    // Tree-walk cost per voxel ≈ full − center-hit.
+    // Cache + leaf-lookup per voxel ≈ center-hit − framing.
+    // Framing per voxel ≈ framing.
+
+    std::fill(sums.begin(), sums.end(), uint64_t(0));
+    const double framingUs = timeForEach([&] {
+        nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
+            [&](const nanovdb::util::Range1D& range) {
+                alignas(64) uint32_t leafIndex[BlockWidth];
+                alignas(64) uint16_t voxelOffset[BlockWidth];
+                uint64_t* bs0 = sums.data();
+
+                for (size_t bID = range.begin(); bID != range.end(); ++bID) {
+                    CPUVBM::decodeInverseMaps(
+                        grid, firstLeafID[bID],
+                        &jumpMap[bID * CPUVBM::JumpMapLength],
+                        firstOffset + bID * BlockWidth,
+                        leafIndex, voxelOffset);
+
+                    uint64_t* bs = bs0 + bID * BlockWidth;
+                    for (int i = 0; i < BlockWidth; ++i) {
+                        if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue;
+                        const uint16_t vo = voxelOffset[i];
+                        const uint32_t li = leafIndex[i];
+                        const nanovdb::Coord cOrigin = firstLeaf[li].origin();
+                        const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7;
+                        const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz);
+                        // 18 trivial "taps" — no accessor call; anti-DCE via Coord components.
+                        uint64_t s = 0;
+                        for (int k = 0; k < LegacyAccT::size(); ++k)
+                            s += static_cast<uint64_t>(center.x() + center.y() + center.z() + k);
+                        bs[i] = s;
+                    }
+                }
+            });
+    });
+
+    std::fill(sums.begin(), sums.end(), uint64_t(0));
+    const double centerHitUs = timeForEach([&] {
+        nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
+            [&](const nanovdb::util::Range1D& range) {
+                alignas(64) uint32_t leafIndex[BlockWidth];
+                alignas(64) uint16_t voxelOffset[BlockWidth];
+                auto acc = grid->getAccessor();
+                uint64_t* bs0 = sums.data();
+
+                for (size_t bID = range.begin(); bID != range.end(); ++bID) {
+                    CPUVBM::decodeInverseMaps(
+                        grid, firstLeafID[bID],
+                        &jumpMap[bID * CPUVBM::JumpMapLength],
+                        firstOffset + bID * BlockWidth,
+                        leafIndex, voxelOffset);
+
+                    uint64_t* bs = bs0 + bID * BlockWidth;
+                    for (int i = 0; i < BlockWidth; ++i) {
+                        if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue;
+                        (void)voxelOffset[i]; // keep decode non-dead
+                        const uint32_t li = leafIndex[i];
+                        const nanovdb::Coord cOrigin = firstLeaf[li].origin();
+                        // 18 distinct positions ALL within this leaf's 8^3 footprint
+                        // — guarantees leaf-cache hit on every call, but each coord
+                        // is unique so the compiler can't CSE the lookups.
+                        // k in [0..17]: local (k&7, (k>>3)&1, 0) sweeps an 8x2x1 slab.
+                        uint64_t s = 0;
+                        for (int k = 0; k < LegacyAccT::size(); ++k) {
+                            const nanovdb::Coord c = cOrigin +
+                                nanovdb::Coord(k & 7, (k >> 3) & 1, 0);
+                            s += static_cast<uint64_t>(acc.getValue(c));
+                        }
+                        bs[i] = s;
+                    }
+                }
+            });
+    });
+
     // ---- LegacyStencilAccessor ----

     std::fill(sums.begin(), sums.end(), uint64_t(0));
@@ -403,6 +487,23 @@ static void runPerf(
                 legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels),
                 (legacyUs - decodeUs) / 1e3, legacyChecksum);

+    // Decomposition of LegacyStencilAccessor's ns/voxel:
+    //   framing      = no accessor call
+    //   cache + leaf = centerHit − framing  (per 18 taps)
+    //   tree walk    = legacy − centerHit   (per 18 taps; amortises over ~25% miss rate)
+    const double framingNs   = framingUs   * 1e3 / double(nVoxels);
+    const double centerHitNs = centerHitUs * 1e3 / double(nVoxels);
+    const double legacyNs    = legacyUs    * 1e3 / double(nVoxels);
+    std::printf("\nLegacy cost decomposition (18 taps/voxel):\n");
+    std::printf("  framing only         : %7.1f ms (%5.1f ns/voxel)\n",
+                framingUs / 1e3, framingNs);
+    std::printf("  + center-hit × 18    : %7.1f ms (%5.1f ns/voxel) [cache+leaf = %5.2f ns/vox = %4.2f ns/tap]\n",
+                centerHitUs / 1e3, centerHitNs,
+                centerHitNs - framingNs, (centerHitNs - framingNs) / 18.0);
+    std::printf("  + stencil × 18 (full): %7.1f ms (%5.1f ns/voxel) [tree walk = %5.2f ns/vox = %4.2f ns/tap]\n",
+                legacyUs / 1e3, legacyNs,
+                legacyNs - centerHitNs, (legacyNs - centerHitNs) / 18.0);
+
     if (stencilChecksum != legacyChecksum)
         std::cerr << "  WARNING: checksums differ — accessor results disagree!\n";
 }

From c90db495bef4e69b12c47fed09ba85a68ea11574 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sat, 18 Apr 2026 16:04:32 -0500
Subject: [PATCH 33/60] StencilAccessor/BatchAccessor: hybrid SIMD ->
 scalar-tail cachedGetValue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces BatchAccessor's fully-SIMD gather chain (§8e, Steps 1–8) with a
hybrid design that keeps the naturally-SIMD work vectorised and hands the
per-tap leaf lookup to a scalar loop.
Motivated by the end-to-end codegen analysis in BatchAccessor.md §8f / §8h:

- Without [[gnu::flatten]], GCC outlined the 14 Simd.h helpers inside
  cachedGetValue and the 18 per-tap calls from calcTaps, paying ~323
  function calls and ~282 vzeroupper per 16-voxel batch and losing to
  scalar LegacyStencilAccessor (7.5 vs 5.4 ns/voxel).

- With flatten, GCC ran at 3.7 ns/voxel but as a single 77 KB monolithic
  body per StencilAccessor instantiation — a strong dependency on a
  compiler-specific attribute for competitive perf.

Hybrid split (BatchAccessor.md §8i):

Stays SIMD (native __m256i uint16 throughout — no _Fixed aggregate ABI,
no gathers, no heterogeneous where-blends):
- prefetch() — unchanged.
- Setup in cachedGetValue: SWAR expansion, packed_sum, base-32 direction
  extraction (d_u16), packed-layout -> 9-bit local offset extraction
  (localOffset_u16).

Goes scalar (via two util::store harvests and a bitmask leafMask):
- Per-lane pointer chase into mNeighborLeafIDs + mFirstLeaf.
- One leaf.getValue(offset) call per active lane — the LeafNode handles
  valueMask / mPrefixSum / popcount internally.

Public API changes:

BatchAccessor::cachedGetValue signature:
- before: void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT mask)
- after:  void cachedGetValue(ScalarValueT (&dst)[LaneWidth], VoxelOffsetT vo,
          PredicateT mask)

The C-array output allows the scalar tail to write lane results with a
single mov, eliminating the 18× WhereExpression::operator= outlined body.

StencilAccessor:
- Storage: Simd<uint64_t, W> mIndices[SIZE] -> uint64_t mIndices[SIZE][W],
  now public. Layout is part of the ABI.
- moveTo() returns void (active-lane info is leafIndex[i] != UnusedLeafIndex,
  already available to the caller — no duplicate mask).
- Removed getValue<di,dj,dk>() and operator[]; added a static constexpr
  tapIndex<di,dj,dk>() for reorder-safe compile-time named-tap access.
- Public API now contains zero Simd<>/SimdMask<> types. Callers SIMD-load
  tap rows from mIndices[k] with whatever backend they choose, or iterate
  scalarly — we don't impose a choice.

Simd.h additions:
- util::store(v, p) — uniform store shim dispatching to stdx::copy_to on
  the stdx backend and to Simd::store on the array backend.

BatchAccessor.h additions:
- mFirstLeaf member (cached getFirstLeaf() base) for the scalar tail
  leaf lookup.

End-to-end perf (32 M ambient / 50% / 32 threads, i9-285K Arrow Lake):

  Variant                              GCC ns/vox    Clang ns/vox
  Old SIMD path, no flatten                   7.5             4.3
  Old SIMD path, +flatten on moveTo           3.7             4.3
  Hybrid (this commit), no flatten            5.1             4.9
  Hybrid +flatten on moveTo                   4.8             4.8
  LegacyStencilAccessor                       5.5             6.7

Without flatten, the hybrid is ~32% faster than the old SIMD path on GCC
(7.5 -> 5.1), and the two compilers now land within 0.2 ns/voxel of each
other, eliminating the 3x compiler spread that §8f / §8h documented and
making both compilers beat the scalar LegacyStencilAccessor oracle. The
remaining cost on GCC (1.4 ns/voxel vs SIMD+flatten at 3.7) is partially
recoverable by re-applying flatten at the caller site (4.8 ns/voxel) —
but the shipped code no longer requires the attribute for acceptable
performance.

Correctness: 12M lane-checks across all 18 WENO5 taps pass against the
LegacyStencilAccessor oracle; XOR checksum matches across the full 16.7 M
active voxel workload.

Docs:
- BatchAccessor.md §8e marked "historical"; §8i added with the hybrid
  design, perf matrix, and cleanup notes (gather/simd_cast/popcount
  helpers in Simd.h are no longer exercised and can be removed in a
  follow-up).
- StencilAccessor.md §8.1 rewritten to describe the Simd-free public API
  and the new caller pattern.
Signed-off-by: Efty Sifakis
---
 .../stencil_gather_cpu.cpp                    |  58 ++--
 nanovdb/nanovdb/util/BatchAccessor.h          | 273 ++++++------------
 nanovdb/nanovdb/util/BatchAccessor.md         | 115 +++++++-
 nanovdb/nanovdb/util/Simd.h                   |  12 +
 nanovdb/nanovdb/util/StencilAccessor.h        | 159 +++++-----
 nanovdb/nanovdb/util/StencilAccessor.md       |  71 ++++-
 6 files changed, 362 insertions(+), 326 deletions(-)

diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index b2954be9af..6234739711 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -68,7 +68,6 @@
 using CPUVBM = nanovdb::tools::VoxelBlockManager;
 using SAccT = nanovdb::StencilAccessor;
 using LegacyAccT = nanovdb::LegacyStencilAccessor;
-using IndexMaskT = nanovdb::util::SimdMask<uint64_t, SIMDw>;

 // ============================================================
 // Test domain generation (mirrors vbm_host_cuda.cpp)
@@ -120,13 +119,13 @@ struct VerifyStats {

 /// Cross-validate one StencilAccessor batch against LegacyStencilAccessor.
 ///
-/// Active lanes: reconstruct the global coordinate from (leafIndex, voxelOffset),
-/// call legacyAcc.moveTo(), and compare all SIZE tap indices element-by-element.
+/// Active lanes (leafIndex[p] != UnusedLeafIndex): reconstruct the global
+/// coordinate from (leafIndex, voxelOffset), call legacyAcc.moveTo(), and
+/// compare all SIZE tap indices element-by-element.
 ///
 /// Inactive lanes: assert all tap slots in stencilAcc hold 0 (background index).
 static void verifyStencilAccessor(
     const SAccT& stencilAcc,
-    IndexMaskT activeMask,
     const uint32_t* leafIndex,
     const uint16_t* voxelOffset,
     int batchStart,
@@ -134,29 +133,28 @@ static void verifyStencilAccessor(
     LegacyAccT& legacyAcc,
     VerifyStats& stats)
 {
-    // Inactive lanes: all tap slots must hold 0 (NanoVDB background index).
     for (int i = 0; i < SIMDw; ++i) {
-        if (activeMask[i]) continue;
-        for (int k = 0; k < stencilAcc.size(); ++k) {
-            ++stats.laneChecks;
-            const uint64_t got = static_cast<uint64_t>(stencilAcc[k][i]);
-            if (got != 0) {
-                ++stats.errors;
-                if (stats.errors <= 10)
-                    std::cerr << "STENCIL inactive lane=" << i
-                              << " tap=" << k
-                              << ": expected 0, got " << got << "\n";
+        const int p = batchStart + i;
+        const uint32_t li = leafIndex[p];
+
+        if (li == CPUVBM::UnusedLeafIndex) {
+            // Inactive lane: all tap slots must hold 0 (NanoVDB background index).
+            for (int k = 0; k < stencilAcc.size(); ++k) {
+                ++stats.laneChecks;
+                const uint64_t got = stencilAcc.mIndices[k][i];
+                if (got != 0) {
+                    ++stats.errors;
+                    if (stats.errors <= 10)
+                        std::cerr << "STENCIL inactive lane=" << i
+                                  << " tap=" << k
+                                  << ": expected 0, got " << got << "\n";
+                }
             }
+            continue;
         }
-    }
-
-    // Active lanes: compare against the LegacyStencilAccessor oracle.
-    for (int i = 0; i < SIMDw; ++i) {
-        if (!activeMask[i]) continue;

-        const int p = batchStart + i;
-        const uint16_t vo = voxelOffset[p];
-        const uint32_t li = leafIndex[p];
+        // Active lane: compare against the LegacyStencilAccessor oracle.
+        const uint16_t vo = voxelOffset[p];

         const nanovdb::Coord cOrigin = firstLeaf[li].origin();
         const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7;
@@ -165,7 +163,7 @@ static void verifyStencilAccessor(
         for (int k = 0; k < stencilAcc.size(); ++k) {
             ++stats.laneChecks;
             const uint64_t expected = legacyAcc[k];
-            const uint64_t actual = static_cast<uint64_t>(stencilAcc[k][i]);
+            const uint64_t actual = stencilAcc.mIndices[k][i];
             if (actual != expected) {
                 ++stats.errors;
                 if (stats.errors <= 10)
@@ -218,9 +216,8 @@ static void runPrototype(
             SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves);

             for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
-                const IndexMaskT active =
-                    stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
-                verifyStencilAccessor(stencilAcc, active,
+                stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
+                verifyStencilAccessor(stencilAcc,
                                       leafIndex, voxelOffset, batchStart,
                                       firstLeaf, legacyAcc, stats);
             }
@@ -335,13 +332,12 @@ static void runPerf(
                     uint64_t* bs = sums.data() + bID * BlockWidth;

                     for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
-                        const IndexMaskT active =
-                            stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
+                        stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
                         for (int i = 0; i < SIMDw; ++i) {
-                            if (!active[i]) continue;
+                            if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue;
                             uint64_t s = 0;
                             for (int k = 0; k < SAccT::size(); ++k)
-                                s += static_cast<uint64_t>(stencilAcc[k][i]);
+                                s += stencilAcc.mIndices[k][i];
                             bs[batchStart + i] = s;
                         }
                     }
diff --git a/nanovdb/nanovdb/util/BatchAccessor.h b/nanovdb/nanovdb/util/BatchAccessor.h
index 077bb6921b..995a82c0a6 100644
--- a/nanovdb/nanovdb/util/BatchAccessor.h
+++ b/nanovdb/nanovdb/util/BatchAccessor.h
@@ -148,6 +148,7 @@ class BatchAccessor
     , mCenterLeafID(firstLeafID)
     , mCenterOrigin(grid.tree().getFirstLeaf()[firstLeafID].origin())
     , mProbedMask(1u << dir(0, 0, 0))
+    , mFirstLeaf  (grid.tree().getFirstLeaf())
     , mOffsetBase (reinterpret_cast<const uint64_t*>(&grid.tree().getFirstLeaf()[0].data()->mOffset))
     , mPrefixBase (reinterpret_cast<const uint64_t*>(&grid.tree().getFirstLeaf()[0].data()->mPrefixSum))
     , mMaskWordBase(grid.tree().getFirstLeaf()[0].valueMask().words())
@@ -325,203 +326,92 @@ class BatchAccessor
     // appropriate neighbor leaf and calls leaf->getValue(offset).
     //
     // Requires prefetch (or any prefetch covering the same directions)
-    // to have been called first. Debug builds assert mProbedMask coverage.
+    // to have been called first.
     //
-    // A null leaf pointer (neighbor outside the narrow band) writes 0 to result.
-    // Inactive lanes (leafMask[i] == false) are not touched.
+    // A null leaf pointer (neighbor outside the narrow band) writes 0 to dst[lane].
+    // Inactive lanes (bit lane of leafMask clear) are not touched.
+    //
+    // Output layout: `ScalarValueT (&dst)[LaneWidth]` — a plain C array, one
+    // entry per SIMD lane. This allows the scalar-tail loop below to write
+    // lane results with a single `mov`, avoiding the heterogeneous-mask
+    // where-blend that the old `Simd<uint64_t, LaneWidth>&` signature triggered.
+    //
+    // Hybrid design (BatchAccessor.md §8i / StencilAccessor.md §8.1):
+    //   SIMD portion stays in native uint16_t __m256i (no aggregate ABI):
+    //     SWAR expansion, packed_sum, base-32 direction extract (d_u16),
+    //     local-offset extract (localOffset_u16).
+    //   Harvest: two YMM stores to stack C arrays (neighborIdx[], localOffset[]).
+ // Scalar tail: per-lane pointer chase into mNeighborLeafIDs + mFirstLeaf, + // one leaf.getValue(offset) call. The LeafNode handles valueMask / + // prefixSum / popcount internally — one popcnt per lookup. // ------------------------------------------------------------------------- template - void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const + void cachedGetValue(ScalarValueT (&dst)[LaneWidth], + VoxelOffsetT vo, + PredicateT leafMask) const { - // ----------------------------------------------------------------------- - // SIMD ingredient fetch (WIP -- not yet wired to result) - // - // Recomputes packed_sum (same SWAR expansion as prefetch) to extract the - // three per-lane ingredients needed to replace leaf->getValue() with fully - // SIMD index arithmetic + value gather. See BatchAccessor.md Sec.8d. - // - // offsets -- leaf->mOffset: base value index for the leaf - // prefixSums -- leaf->mPrefixSum[w]: prefix popcount up to x-slice w - // maskWords -- leaf->mMask.mWords[w]: uint64_t mask for x-slice w - // - // w = dest_x = bits [10:12] of packed_sum (NanoVDB leaf layout: x is - // the most significant axis, so x-slices index the eight uint64_t words). - // - // dir per lane is extracted via the base-32 multiply trick (Sec.8d): - // v = (packed_sum & 0x6318u) >> 3 - // dir = (v * 1129u) >> 10 - // - // Note: exact field names (mOffset, mPrefixSum, mMask.mWords) need - // verification against LeafData in NanoVDB.h. - // ----------------------------------------------------------------------- - { - static constexpr auto packed_tap = - static_cast( - (unsigned(dk) + 8u) - | ((unsigned(dj) + 8u) << 5) - | ((unsigned(di) + 8u) << 10)); - const auto expanded = - ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask)) - | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask)); - - auto packed_lc = VoxelOffsetT(kSwarSentinel); - util::where(leafMask, packed_lc) = expanded; - const auto packed_sum = packed_lc + VoxelOffsetT(packed_tap); - - // dest_x per lane: bits [10:12] of packed_sum -> uint64_t mask word index (0..7) - const auto wordIdx_u16 = (packed_sum >> VoxelOffsetScalarT(10)) & VoxelOffsetT(7u); - - // SIMD gather of mOffset, mPrefixSum[w], and maskWords[w] per lane. - // - // Step 1 -- d_vec: per-lane dir (0..26) via base-32 multiply trick (Sec.8d). - // No widening needed: we extract bits [10:14] of (v * 1129). Those - // bits lie entirely below bit 16, so the modular uint16_t product gives - // the same answer as the full-width product for all valid + sentinel inputs. - // - // Step 2 -- tapLeafID_u32: gather mNeighborLeafIDs[d] for all lanes at once. - // - // Step 3 -- tapLeafOffset_i64: leaf_id * (sizeof(LeafT)/sizeof(uint64_t)). - // This is the per-lane uint64_t-stride index into the flat leaf array, - // viewed as a uint64_t[] through the base pointer of the target field. - // Invalid (kNullLeafID) lanes are clamped to index 0 (safe; masked out). - // - // Step 4 -- offsets / prefixSums: two gathers with different base pointers - // but the same tapLeafOffset_i64; masked to 0 for null lanes. - // mPrefixSum is a packed uint64_t: field w lives at bits [9*(w-1)+:9] - // (9-bit fields, w=0 -> prefix = 0 by definition). - // - // Step 5 -- maskWords: gather from valueMask().words() base. - // words()[wi] for leaf[leaf_id] = mask_word_base[leaf_id*kStride + wi]. - // The per-lane wi is added to tapLeafOffset_i64 to form the mask gather index. - // Direction-extraction constants (base-32 multiply trick, Sec.8d). 
- static constexpr uint16_t kSwarCarryMask = 0x6318u; // carry bits [3:4],[8:9],[13:14] - static constexpr uint16_t kDirMul = 1129u; // base-32 multiplier: 1*32^2 + 3*32 + 9 - static constexpr uint16_t kDirMask = 31u; // 5-bit digit mask - - // Step 1 -- d_vec: per-lane dir (0..26) via base-32 multiply (Sec.8d). - // Stay in uint16_t throughout: bits [10:14] of (v * 1129) are entirely - // within the lower 16 bits, so the modular uint16_t product gives the - // same result as the full-width product for all valid inputs. - const auto d_u16 = (((packed_sum & VoxelOffsetT(kSwarCarryMask)) - >> VoxelOffsetScalarT(3)) - * VoxelOffsetT(kDirMul) - >> VoxelOffsetScalarT(10)) - & VoxelOffsetT(kDirMask); - const auto d_i32 = util::simd_cast(d_u16); - - // Step 2 -- leaf IDs: unmasked gather (all lanes have d_i32 ∈ [0,26] by - // SWAR invariant, so mNeighborLeafIDs[d_i32[i]] is always a valid access). - // Non-leafMask lanes read the current center leaf's neighbor at direction d, - // which is filtered out by the explicit leafMask AND in valid_u32 below. - const LeafIDVecT tapLeafID_u32 = util::gather(mNeighborLeafIDs, d_i32); - const auto valid_u32 = leafMask & (tapLeafID_u32 != LeafIDVecT(kNullLeafID)); - - // Step 3 -- stride-scaled gather indices (widened to int64_t, invalid lanes -> 0) - // kStride is sizeof(LeafT)/sizeof(uint64_t); the static_assert makes the - // divisibility assumption explicit (NanoVDB leaves are always 8-byte aligned). - static_assert(sizeof(LeafT) % sizeof(uint64_t) == 0, - "LeafT must be uint64_t-aligned for packed gather indexing"); - static constexpr int64_t kStride = int64_t(sizeof(LeafT) / sizeof(uint64_t)); - using Int64VecT = std::conditional_t>; - Int64VecT tapLeafOffset_i64(0); - util::simd_cast_if(tapLeafOffset_i64, valid_u32, tapLeafID_u32); - tapLeafOffset_i64 = tapLeafOffset_i64 * Int64VecT(kStride); - - // Step 4a -- offsets (mOffset): unmasked gather. - // Invalid lanes have tapLeafOffset_i64=0 (from simd_cast_if), reading from - // index 0 (center leaf's data). These lanes are excluded by isActive in Step 7. - const LeafDataVecT offsets = util::gather(mOffsetBase, tapLeafOffset_i64); - - // Step 4b -- prefixSums (mPrefixSum packed uint64_t, shift-extract field w): - // unmasked gather for the same reason as Step 4a. After the shift-extract - // below, invalid-lane values don't matter because isActive filters them in Step 8. - LeafDataVecT prefixSums = util::gather(mPrefixBase, tapLeafOffset_i64); - const auto wordIdx_u64 = util::simd_cast(wordIdx_u16); - const auto nonzero_w = (wordIdx_u64 != LeafDataVecT(0)); - const auto shift = util::where(nonzero_w, (wordIdx_u64 - LeafDataVecT(1)) * LeafDataVecT(9), LeafDataVecT(0)); - prefixSums = util::where(nonzero_w, (prefixSums >> shift) & LeafDataVecT(511u), LeafDataVecT(0)); - - // Step 5 -- maskWords (valueMask().words()[w]) - // mMaskWordBase[leaf_id*kStride + w] == leaf[leaf_id].valueMask().words()[w] - // because the mask field is at a fixed offsetof within every LeafT. - // Kept as gather_if (masked) so that invalid lanes get maskWords=0, which - // guarantees isActive=false in Step 7 without needing a cross-width mask AND. - const auto wordIdx_i64 = util::simd_cast(wordIdx_u16); - const auto mask_idx = tapLeafOffset_i64 + wordIdx_i64; - LeafDataVecT maskWords(0); - util::gather_if(maskWords, valid_u32, mMaskWordBase, mask_idx); - // Step 6 -- dest_yz: 6-bit intra-word bit position (ny_w*8 + nz_w). - // packed_sum bits [5:7] = dest_y, bits [0:2] = dest_z (both wrapped mod 8). 
- const auto dest_yz_u16 = ((packed_sum >> VoxelOffsetScalarT(2)) & VoxelOffsetT(0x38u)) - | (packed_sum & VoxelOffsetT(0x07u)); - const auto dest_yz_u64 = util::simd_cast(dest_yz_u16); - - // Step 7 -- activity check + truncated maskWord. - // If voxel dest_yz is inactive, getValue returns 0 (not the formula below). - // Null-leaf lanes already have maskWords=0, so they are implicitly inactive. - const auto voxelBit = LeafDataVecT(1) << dest_yz_u64; - const auto isActive = (maskWords & voxelBit) != LeafDataVecT(0); - const auto truncated = maskWords & (voxelBit - LeafDataVecT(1)); - - // Step 8 -- fill result in-place; leafMask-clear lanes are untouched. - util::where(isActive, result) = offsets + prefixSums + util::popcount(truncated); - - // ------------------------------------------------------------------- - // Debug cross-check: validate SIMD-path values against scalar ref - // ------------------------------------------------------------------- -#ifndef NDEBUG - using LeafDataVecTraits = util::simd_traits; - for (int i = 0; i < LaneWidth; ++i) { - if (!Pred_traits::get(leafMask, i)) continue; // only check lanes caller asked about - - // Scalar reference: same arithmetic as the legacy loop below - const auto vo_i = static_cast(VO_traits::get(vo, i)); - const int lx = (vo_i >> 6) & 7, ly = (vo_i >> 3) & 7, lz = vo_i & 7; - const int nx = lx + di, ny = ly + dj, nz = lz + dk; - const int dx = (nx < 0) ? -1 : (nx >= 8) ? 1 : 0; - const int dy = (ny < 0) ? -1 : (ny >= 8) ? 1 : 0; - const int dz = (nz < 0) ? -1 : (nz >= 8) ? 1 : 0; - const int d_ref = dir(dx, dy, dz); - const int nx_w = nx - dx * 8; - const int ny_w = ny - dy * 8; - const int nz_w = nz - dz * 8; - const uint32_t ref_id = mNeighborLeafIDs[d_ref]; - const LeafT* ref = (ref_id != kNullLeafID) - ? &mGrid.tree().getFirstLeaf()[ref_id] : nullptr; - - // SIMD-path values for this lane - const uint32_t ps_i = static_cast(VO_traits::get(packed_sum, i)); - const int d_simd = int((((ps_i & 0x6318u) >> 3) * 1129u >> 10) & 31u); - const int wi = int(VO_traits::get(wordIdx_u16, i)); - - assert(d_simd == d_ref && "cachedGetValue SIMD: dir mismatch"); - assert(wi == nx_w && "cachedGetValue SIMD: w (dest_x) mismatch"); - - if (ref) { - const uint64_t pfx_ref = (uint32_t(nx_w) > 0u) - ? 
(ref->data()->mPrefixSum >> (9u * (uint32_t(nx_w) - 1u))) & 511u
-                        : uint64_t(0);
-                    const uint32_t ref_offset = uint32_t(nx_w)*64u + uint32_t(ny_w)*8u + uint32_t(nz_w);
-                    assert(LeafDataVecTraits::get(offsets, i) == ref->data()->mOffset
-                        && "cachedGetValue SIMD: mOffset mismatch");
-                    assert(LeafDataVecTraits::get(prefixSums, i) == pfx_ref
-                        && "cachedGetValue SIMD: mPrefixSum mismatch");
-                    assert(LeafDataVecTraits::get(maskWords, i) == ref->valueMask().words()[nx_w]
-                        && "cachedGetValue SIMD: maskWord mismatch");
-                    assert(Val_traits::get(result, i) == static_cast<uint64_t>(ref->getValue(ref_offset))
-                        && "cachedGetValue SIMD: final result mismatch");
-                } else {
-                    assert(LeafDataVecTraits::get(offsets, i) == uint64_t(0)
-                        && "cachedGetValue SIMD: null leaf offsets should be 0");
-                    assert(LeafDataVecTraits::get(prefixSums, i) == uint64_t(0)
-                        && "cachedGetValue SIMD: null leaf prefixSums should be 0");
-                    assert(LeafDataVecTraits::get(maskWords, i) == uint64_t(0)
-                        && "cachedGetValue SIMD: null leaf maskWords should be 0");
-                    assert(Val_traits::get(result, i) == uint64_t(0)
-                        && "cachedGetValue SIMD: null leaf result should be 0");
+        // ---- SIMD portion (native __m256i uint16_t throughout — no aggregate ABI) ----
+        static constexpr auto packed_tap =
+            static_cast<VoxelOffsetScalarT>(
+                  (unsigned(dk) + 8u)
+                | ((unsigned(dj) + 8u) << 5)
+                | ((unsigned(di) + 8u) << 10));
+        // SWAR expansion of the (x,y,z) local position of the center voxel.
+        // Inactive-lane values are garbage; the scalar tail below filters them
+        // out via the leafMask bitmask, so no sentinel / where-blend is needed.
+        const auto expanded =
+              ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask))
+            | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask));
+        const auto packed_sum = expanded + VoxelOffsetT(packed_tap);
+
+        // Per-lane direction index (0..26) via the base-32 multiply trick (§8d).
+        // Stays in uint16_t — bits [10:14] of (v * 1129) lie entirely below bit 16,
+        // so the modular uint16_t product gives the same result as the full-width
+        // product for all valid inputs. No int32 widening → no _Fixed aggregate.
+        static constexpr uint16_t kSwarCarryMask = 0x6318u;
+        static constexpr uint16_t kDirMul  = 1129u;
+        static constexpr uint16_t kDirMask = 31u;
+        const auto d_u16 = (((packed_sum & VoxelOffsetT(kSwarCarryMask))
+                                >> VoxelOffsetScalarT(3))
+                            * VoxelOffsetT(kDirMul)
+                            >> VoxelOffsetScalarT(10))
+                           & VoxelOffsetT(kDirMask);
+
+        // Per-lane 9-bit local offset in the destination leaf.
+        // NanoVDB leaf layout: offset = (destX << 6) | (destY << 3) | destZ.
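+        // (Worked check of the layout: destX=5, destY=2, destZ=3 gives
+        //  offset = 5*64 + 2*8 + 3 = 339 = 0523 in octal, matching the
+        //  x/y/z strides 0100/010/1.)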
+        // packed_sum bits:  destX=[10:12], destY=[5:7], destZ=[0:2]
+        // output bits:      destX=[6:8],   destY=[3:5], destZ=[0:2]
+        const auto localOffset_u16 =
+              ((packed_sum >> VoxelOffsetScalarT(4)) & VoxelOffsetT(0x1C0u))
+            | ((packed_sum >> VoxelOffsetScalarT(2)) & VoxelOffsetT(0x38u))
+            |  (packed_sum & VoxelOffsetT(0x07u));
+
+        // ---- Harvest SIMD → C arrays and scalar tail ----
+        if constexpr (LaneWidth == 1) {
+            if (!leafMask) return; // inactive: leave dst[0] alone
+            const uint32_t leafID = mNeighborLeafIDs[uint32_t(d_u16)];
+            if (leafID == kNullLeafID) { dst[0] = ScalarValueT(0); return; }
+            dst[0] = static_cast<ScalarValueT>(
+                mFirstLeaf[leafID].getValue(uint32_t(localOffset_u16)));
+        } else {
+            alignas(32) uint16_t localOffset[LaneWidth];
+            alignas(32) uint16_t neighborIdx[LaneWidth];
+            util::store(localOffset_u16, localOffset);
+            util::store(d_u16, neighborIdx);
+
+            // Convert SIMD leafMask → uint32_t bitmask once; then a single
+            // scalar loop over active lanes with no further SIMD in sight.
+            const uint32_t activeBits = util::to_bitmask(leafMask);
+            for (int lane = 0; lane < LaneWidth; ++lane) {
+                if (!((activeBits >> lane) & 1u)) continue;
+                const uint32_t leafID = mNeighborLeafIDs[neighborIdx[lane]];
+                if (leafID == kNullLeafID) {
+                    dst[lane] = ScalarValueT(0);
+                    continue;
+                }
+                dst[lane] = static_cast<ScalarValueT>(
+                    mFirstLeaf[leafID].getValue(localOffset[lane]));
+            }
+        }
     }

@@ -541,6 +431,7 @@ class BatchAccessor
     Coord    mCenterOrigin;
     uint32_t mProbedMask;
     uint32_t mNeighborLeafIDs[27]; // kNullLeafID when not probed or outside narrow band
+    const LeafT* const mFirstLeaf;       // getFirstLeaf() — scalar-tail leaf lookup base
     const uint64_t* const mOffsetBase;   // &getFirstLeaf()[0].data()->mOffset
     const uint64_t* const mPrefixBase;   // &getFirstLeaf()[0].data()->mPrefixSum
     const uint64_t* const mMaskWordBase; // getFirstLeaf()[0].valueMask().words()
diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md
index 9febb5b6d8..41851df786 100644
--- a/nanovdb/nanovdb/util/BatchAccessor.md
+++ b/nanovdb/nanovdb/util/BatchAccessor.md
@@ -393,7 +393,16 @@
 for (int cx : {0,1,2}) for (int cy : {0,1,2}) for (int cz : {0,1,2}) {
 }
 ```

-### 8e. `cachedGetValue` gather pipeline — Steps 1–8
+### 8e. `cachedGetValue` gather pipeline — Steps 1–8 *(historical)*
+
+> **Note — this section describes the prior fully-SIMD design.** The current
+> implementation uses a **hybrid SIMD → scalar-tail** design (see §8i): Step 1
+> (`d_vec`) plus a parallel local-offset extraction stay SIMD (native `__m256i`
+> uint16 arithmetic with no aggregate ABI), then per-lane values are harvested
+> into stack C arrays and the leaf lookup runs as a plain scalar loop calling
+> `leaf.getValue(offset)` directly. Steps 2–8 below no longer appear in the
+> source. The material is preserved here as the rationale behind the original
+> SIMD gather chain and the baseline the hybrid was compared against.

 `cachedGetValue` recomputes `packed_sum` identically to `prefetch` (§8a), then runs
 the following fully-SIMD pipeline. All types are SIMD vectors of the indicated element
@@ -899,6 +908,110 @@ to their own hot entry point; the attribute is safe and a no-op under Clang.
 This choice keeps the library's default codegen predictable and avoids forcing
 a 77 KB monolithic body on callers with smaller working sets.

+### 8i. Hybrid SIMD → scalar-tail design *(current)*
+
+The findings of §8f/§8h motivated a different trade-off, which is what the
+codebase now ships.
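+
+In outline, the shipped `cachedGetValue` now has the following shape (a
+condensed sketch of the code in the diff above; `swarExpand`, `dirFromCarries`,
+and `repack9Bit` are illustrative names for inline expressions, not real
+functions):
+
+```cpp
+// SIMD half: one native __m256i of uint16 lanes end-to-end.
+const auto packed_sum      = swarExpand(vo) + VoxelOffsetT(packed_tap); // SWAR add
+const auto d_u16           = dirFromCarries(packed_sum);  // direction 0..26 per lane
+const auto localOffset_u16 = repack9Bit(packed_sum);      // (x<<6)|(y<<3)|z per lane
+
+// Harvest: two YMM stores plus one mask-to-bitmask conversion.
+alignas(32) uint16_t localOffset[LaneWidth], neighborIdx[LaneWidth];
+util::store(localOffset_u16, localOffset);
+util::store(d_u16, neighborIdx);
+const uint32_t activeBits = util::to_bitmask(leafMask);
+
+// Scalar tail: plain loads and one leaf.getValue() per active lane.
+for (int lane = 0; lane < LaneWidth; ++lane) {
+    if (!((activeBits >> lane) & 1u)) continue;
+    const uint32_t leafID = mNeighborLeafIDs[neighborIdx[lane]];
+    dst[lane] = (leafID == kNullLeafID)
+              ? ScalarValueT(0)
+              : ScalarValueT(mFirstLeaf[leafID].getValue(localOffset[lane]));
+}
+```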
+
+**Where SIMD genuinely helps** (kept as SIMD):
+- `prefetch()` — SWAR direction extraction over
+  `Simd<uint16_t, 16>` (32 B = one native `__m256i`), horizontal carry-bit
+  reductions, mask-bit identification of unique neighbor directions.
+  Amortizes the `probeLeaf` call over all 16 lanes and over every tap that
+  reaches the same direction.
+- The *setup* half of `cachedGetValue`: SWAR expansion, `packed_sum`, base-32
+  direction extraction (`d_u16`), and local-offset extraction
+  (`localOffset_u16`) from the packed layout. All of this is pure uint16
+  SIMD arithmetic on a single `__m256i` — no aggregate ABI, no gathers, no
+  heterogeneous where-blends, no Simd.h helpers that GCC outlines.
+
+**Where SIMD was dragging us down** (now scalar):
+- The gather chain (Steps 2–8 of §8e): 14 Simd.h helper calls per
+  `cachedGetValue` instantiation, operating on `_Fixed<16>` aggregates. This
+  is what produces 282 `vzeroupper` per batch on GCC without `flatten` (§8h).
+- Scalar equivalents of the arithmetic (a single `popcnt`, a couple of scalar
+  loads from the target leaf, one `uint64_t` add) measure at **0.05 ns/tap**
+  when 18 taps × 16 lanes overlap freely on the load ports (the Legacy cost
+  decomposition — it's what `leaf.getValue(offset)` does internally anyway).
+
+**The boundary**: right after `d_u16` / `localOffset_u16` are computed. Two
+`util::store` calls harvest them into stack `uint16_t[W]` C arrays; a
+`util::to_bitmask` call harvests the SIMD `leafMask` into a `uint32_t` bitmask.
+The scalar tail is a one-liner per lane:
+
+```cpp
+const uint32_t leafID = mNeighborLeafIDs[neighborIdx[lane]];
+if (leafID == kNullLeafID) { dst[lane] = 0; continue; }
+dst[lane] = mFirstLeaf[leafID].getValue(localOffset[lane]);
+```
+
+**API change**: `cachedGetValue`'s output parameter is now
+`ScalarValueT (&dst)[LaneWidth]` — a plain C array, one entry per lane —
+instead of the old `Simd<uint64_t, W>&` aggregate. Scalar lane writes
+are a single `mov` with no mask round-trip, which is what eliminates the
+18× `WhereExpression::operator=` outlined symbol.
+
+**StencilAccessor changes** (StencilAccessor.md §8.1):
+- Storage: `Simd<uint64_t, W> mIndices[SIZE]` → `uint64_t mIndices[SIZE][W]`,
+  made **public** (there's no work hidden behind the access).
+- Return type of `moveTo()`: `SimdMask<uint64_t, W>` → `void` (active-lane
+  information is `leafIndex[i] != UnusedLeafIndex`, already available to
+  the caller).
+- Removed `getValue<di,dj,dk>()` and `operator[]`; added a
+  `static constexpr tapIndex<di,dj,dk>()` for reorder-safe compile-time
+  named-tap access.
+
+**Public API of `StencilAccessor`**: zero `Simd<>` or `SimdMask<>` types.
+Callers may SIMD-load tap rows from `mIndices[k]` with their own preferred
+backend (`Simd<uint64_t, W>::load(mIndices[k], element_aligned)`) or iterate
+scalarly — we don't impose a choice.
+
+#### Perf comparison (same workload as §8h: 32 M ambient / 50% / 32 threads)
+
+| Variant | GCC 13 ns/vox | Clang 18 ns/vox |
+|----------------------------------|--------------:|----------------:|
+| Old SIMD path, no flatten | 7.5 | 4.3 |
+| Old SIMD path, +flatten on moveTo| 3.7 | 4.3 |
+| **Hybrid (current), no flatten** | **5.1** | **4.9** |
+| Hybrid +flatten on moveTo | 4.8 | 4.8 |
+| `LegacyStencilAccessor` | 5.5 | 6.7 |
+
+Without `flatten`, the hybrid is **~32% faster than the old SIMD path on GCC**
+(7.5 → 5.1) and beats scalar Legacy on both compilers. Compiler-sensitivity
+collapses: GCC and Clang deliver within 0.2 ns/voxel of each other,
+eliminating the 3× spread that §8f / §8h documented.
+The 4.8 ns/voxel asymptote with `flatten` on both compilers is consistent
+with the scalar `popcnt` throughput bound (288 `popcnt`/batch ÷ 1 port ÷
+5 GHz ≈ 57 ns/batch ÷ 16 voxels ≈ 3.6 ns/voxel just for `popcnt`, plus
+~1.2 ns/voxel of surrounding work).
+
+#### Cost of the refactor
+
+- GCC loses 1.4 ns/voxel vs the best previous configuration (SIMD +
+  `flatten(moveTo)` at 3.7 ns/vox). The SIMD popcount SWAR tree did real
+  work that scalar `popcnt` can't fully replace on port-1 throughput.
+- Clang loses ~0.6 ns/voxel vs its previous 4.3 ns/vox.
+- Both regressions are partially recoverable by re-enabling `flatten` at the
+  caller's `moveTo` site (4.8 ns/vox on both compilers) — the shipped code
+  just doesn't require it by default.
+
+#### Cleanup of `Simd.h`
+
+With the gather chain gone, several helpers are no longer exercised by
+`BatchAccessor`:
+- `util::gather` / `util::gather_if`
+- `util::simd_cast` for widening `u16 → i32`, `i32 → i64`, `u16 → u64`
+- `util::simd_cast_if`
+- `util::popcount` (vector SWAR) — replaced by scalar `leaf.getValue`'s
+  internal `popcnt`
+- `util::WhereExpression` (heterogeneous form)
+
+These can be removed from `Simd.h` in a follow-up, subject to no external
+caller using them. Added to support the hybrid: `util::store(v, p)` (a
+uniform store shim that dispatches to `copy_to` on stdx and `store` on
+the array backend).
+
 ---

 ## 9. Relationship to Phase 1 Prototype
diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h
index cf89d36f9d..1bc3afc2d6 100644
--- a/nanovdb/nanovdb/util/Simd.h
+++ b/nanovdb/nanovdb/util/Simd.h
@@ -129,6 +129,12 @@ inline bool none_of(SimdMask<T, W> m) { return stdx::none_of(m); }
 template<typename T, int W>
 inline bool all_of(SimdMask<T, W> m) { return stdx::all_of(m); }

+// Store W lanes of v into p[0..W-1] (stdx calls this copy_to).
+template<typename T, int W>
+inline void store(Simd<T, W> v, T* p, element_aligned_tag = {}) {
+    v.copy_to(p, element_aligned);
+}
+
 // Unmasked gather: result[i] = ptr[idx[i]] for all lanes.
 // IdxT may be int32_t or int64_t; the compiler selects the matching hardware
 // instruction (vpgatherdps/vpgatherdq for 32-bit idx, vpgatherqq for 64-bit idx).
@@ -335,6 +341,12 @@ NANOVDB_SIMD_HOSTDEV bool all_of(SimdMask<T, W> m) {
     bool r = true; for (int i = 0; i < W; i++) r &= m[i]; return r;
 }

+// Store W lanes of v into p[0..W-1] (array-backend passthrough to member).
+template<typename T, int W>
+NANOVDB_SIMD_HOSTDEV void store(Simd<T, W> v, T* p, element_aligned_tag = {}) {
+    v.store(p);
+}
+
 // Unmasked gather: result[i] = ptr[idx[i]] for all lanes.
 template<typename T, typename IdxT, int W>
 NANOVDB_SIMD_HOSTDEV Simd<T, W> gather(const T* __restrict__ ptr, Simd<IdxT, W> idx) {
diff --git a/nanovdb/nanovdb/util/StencilAccessor.h b/nanovdb/nanovdb/util/StencilAccessor.h
index eb562eb8b1..17f04f132a 100644
--- a/nanovdb/nanovdb/util/StencilAccessor.h
+++ b/nanovdb/nanovdb/util/StencilAccessor.h
@@ -38,6 +38,7 @@
 #include <nanovdb/NanoVDB.h>
 #include <nanovdb/util/BatchAccessor.h>
+#include <cstring> // std::memset
 #include <cstdint>
 #include <tuple>
 #include <utility>
@@ -123,44 +124,47 @@ class StencilAccessor
     using GridT = NanoGrid<BuildT>;

     // -------------------------------------------------------------------------
-    // Type aliases — scalar/SIMD split (§5 of design doc)
+    // Private type aliases — only used inside moveTo().
+    //
+    // These are the W-lane SIMD types that carry the input arrays through the
+    // straddling loop and the SWAR direction extraction. They do NOT appear
+    // in the public API: callers consume `mIndices` (raw uint64_t[SIZE][W])
+    // directly, and `moveTo` returns `void` — active-lane information is read
+    // from `leafIndex[]` vs `UnusedLeafIndex` by the caller.
    // -------------------------------------------------------------------------
+    using OffsetVec   = std::conditional_t<W == 1, uint16_t, util::Simd<uint16_t, W>>;
+    using LeafIdVec   = std::conditional_t<W == 1, uint32_t, util::Simd<uint32_t, W>>;
+    using LeafMaskVec = std::conditional_t<W == 1, bool, util::SimdMask<uint32_t, W>>;

-    // Output index type: one Simd<uint64_t, W> per tap.
-    using IndexVec = std::conditional_t<W == 1, uint64_t, util::Simd<uint64_t, W>>;
-
-    // Voxel offset type: loaded from the voxelOffset[] array (uint16_t).
-    using OffsetVec = std::conditional_t<W == 1, uint16_t, util::Simd<uint16_t, W>>;
-
-    // Leaf index type: loaded from the leafIndex[] array (uint32_t).
-    using LeafIdVec = std::conditional_t<W == 1, uint32_t, util::Simd<uint32_t, W>>;
-
-    // Internal mask — derived from leafIndex[] comparisons (uint32_t domain).
-    // Passed to BatchAccessor::prefetch / cachedGetValue.
-    using LeafMaskVec = std::conditional_t<W == 1, bool, util::SimdMask<uint32_t, W>>;
-
-    // External mask — returned by moveTo; semantically over mIndices (uint64_t).
-    // Both LeafMaskVec and IndexMaskVec are W-bit masks; conversion is a
-    // boolean round-trip (see SimdMask converting constructor in Simd.h).
-    using IndexMaskVec = std::conditional_t<W == 1, bool, util::SimdMask<uint64_t, W>>;
-
-    // BatchAccessor parameterised with LeafMaskVec (prefetch/cachedGetValue domain).
     using BatchAcc = std::conditional_t<W == 1,
-                         BatchAccessor<BuildT, 1>,
+                         BatchAccessor<BuildT, 1>,
                          BatchAccessor<BuildT, W>>;

     static constexpr int SIZE      = int(std::tuple_size_v<typename StencilT::Taps>);
     static constexpr int HULL_SIZE = int(std::tuple_size_v<typename StencilT::Hull>);

 public:
+    // -------------------------------------------------------------------------
+    // Public API — entirely free of Simd<>/SimdMask<> types.
+    //
+    // Storage layout: `mIndices[tap][lane]` is a plain uint64_t. Callers are
+    // free to SIMD-load it with whatever backend they choose
+    // (e.g. `Simd<uint64_t, W>::load(stencilAcc.mIndices[k], element_aligned)`),
+    // iterate scalarly, or pass slices to downstream kernels — we don't
+    // impose a choice.
+    //
+    // Layout is part of the ABI: [SIZE][W] row-major. Changing it is
+    // a breaking change.
+    // -------------------------------------------------------------------------
+    alignas(64) uint64_t mIndices[SIZE][W];
+
     // -------------------------------------------------------------------------
     // Construction
     //
     // firstLeafID  -- VBM block's starting leaf ID (vbm.hostFirstLeafID()[blockID]).
     // nExtraLeaves -- number of distinct center-leaf advances possible in this block
-    //                 (computed by the caller from the jumpMap). Used only as a
-    //                 debug-mode assert bound; not needed for correctness.
-    //                 See StencilAccessor.md §7 for removal instructions.
+    //                 (computed by the caller from the jumpMap). Debug-only bound
+    //                 on the straddling loop; not needed for correctness.
     // -------------------------------------------------------------------------
     StencilAccessor(const GridT& grid, uint32_t firstLeafID, uint32_t nExtraLeaves)
         : mBatch(grid, firstLeafID)
 #ifndef NDEBUG
         , mNExtraLeaves(nExtraLeaves)
 #endif
     {
-        (void)nExtraLeaves; // suppress unused-parameter warning in release builds
+        (void)nExtraLeaves;
     }

     // -------------------------------------------------------------------------
-    // moveTo -- gather all tap indices for a W-wide batch of center voxels
+    // moveTo -- fill mIndices[0..SIZE-1][0..W-1] for a W-wide batch.
+    //
+    // leafIndex   -- ptr to leafIndex[batchStart]   (uint32_t from decodeInverseMaps)
+    // voxelOffset -- ptr to voxelOffset[batchStart] (uint16_t from decodeInverseMaps)
     //
-    // leafIndex   -- ptr to leafIndex[batchStart] (uint32_t array from decodeInverseMaps)
-    // voxelOffset -- ptr to voxelOffset[batchStart] (uint16_t array from decodeInverseMaps)
+    // Active-lane semantics: a lane i is "active" iff
+    //     leafIndex[i] != UnusedLeafIndex
+    // Active lanes receive their 18 tap indices in mIndices[k][i].
+ // Inactive lanes are zeroed (NanoVDB background index). // - // Returns the initial active-lane mask (leafSlice != UnusedLeafIndex), widened - // to IndexMaskVec. Active lanes have valid results in mIndices[0..SIZE-1]. - // Inactive lanes hold 0 (NanoVDB background index). + // Caller pattern: + // stencilAcc.moveTo(leafIndex + bs, voxelOffset + bs); + // for (int i = 0; i < W; ++i) { + // if (leafIndex[bs + i] == UnusedLeafIndex) continue; + // ...stencilAcc.mIndices[k][i]... + // } // // See StencilAccessor.md §8 for the full straddling loop design. // ------------------------------------------------------------------------- - IndexMaskVec moveTo(const uint32_t* leafIndex, const uint16_t* voxelOffset) + void moveTo(const uint32_t* leafIndex, const uint16_t* voxelOffset) { - // Zero all tap slots — inactive lanes will hold 0 (background index). - zeroIndices(std::make_index_sequence{}); + // Zero the whole results buffer — inactive lanes stay 0. + std::memset(mIndices, 0, sizeof(mIndices)); - // Load this batch. + // Load the batch into SIMD registers for the SWAR / straddling logic. const LeafIdVec leafSlice = loadLeafIdVec(leafIndex); const OffsetVec voVec = loadOffsetVec(voxelOffset); // Initial active-lane mask (which lanes have real voxels). LeafMaskVec activeMask = (leafSlice != LeafIdVec(UnusedLeafIndex)); - // Save before the drain loop — this is what we return. - const IndexMaskVec resultMask = widenMask(activeMask); - - if (util::none_of(activeMask)) return resultMask; + if (util::none_of(activeMask)) return; #ifndef NDEBUG uint32_t nAdvances = 0; @@ -219,41 +228,36 @@ class StencilAccessor } // Prefetch hull — warms all neighbor-leaf directions the full - // stencil can reach, before any cachedGetValue is called. + // stencil can reach, before any cachedGetValue runs. prefetchHull(voVec, leafMask, std::make_index_sequence{}); - // Compute all tap indices and blend into mIndices. + // Fill all SIZE tap entries for the lanes in leafMask. calcTaps(voVec, leafMask, std::make_index_sequence{}); // Remove processed lanes. activeMask = activeMask & !leafMask; } - - return resultMask; } // ------------------------------------------------------------------------- - // getValue -- access tap result by compile-time coordinate + // tapIndex() -- compile-time tap lookup. + // + // Returns the slot in mIndices that corresponds to a named stencil tap, + // resolved at compile time against StencilT::Taps. A tap that is not in + // the stencil produces a static_assert. // - // Resolved entirely at compile time via the findIndex constexpr fold. - // Returns a const reference valid until the next moveTo() call. + // Usage (reorder-safe, zero runtime cost): + // auto& xm3 = stencilAcc.mIndices[SAccT::tapIndex<-3,0,0>()]; // ------------------------------------------------------------------------- template - const IndexVec& getValue() const + static constexpr int tapIndex() { constexpr int I = detail::findIndex( std::make_index_sequence{}); - static_assert(I >= 0, "StencilAccessor::getValue: tap not in stencil"); - return mIndices[I]; + static_assert(I >= 0, "StencilAccessor::tapIndex: tap not in stencil"); + return I; } - // ------------------------------------------------------------------------- - // operator[] -- indexed tap access (for generic iteration over all taps) - // - // No bounds check in release. Same lifetime as getValue. 
-    // -------------------------------------------------------------------------
-    const IndexVec& operator[](int i) const { return mIndices[i]; }
-
     static constexpr int size() { return SIZE; }

 private:
@@ -261,13 +265,6 @@ class StencilAccessor
     // Private helpers
     // -------------------------------------------------------------------------

-    // Compile-time zero of all SIZE index slots.
-    template <size_t... Is>
-    void zeroIndices(std::index_sequence<Is...>)
-    {
-        ((mIndices[Is] = IndexVec(uint64_t(0))), ...);
-    }
-
     // Load LeafIdVec from a uint32_t pointer (scalar or SIMD).
     static LeafIdVec loadLeafIdVec(const uint32_t* p)
     {
@@ -282,16 +279,6 @@ class StencilAccessor
         else return OffsetVec(p, util::element_aligned);
     }

-    // Widen LeafMaskVec (uint32_t domain) → IndexMaskVec (uint64_t domain).
-    // Both are W-bit masks; SimdMask<uint64_t, W> has a converting constructor
-    // from SimdMask<uint32_t, W> that copies the bool array element-by-element
-    // (Simd.h §B). The stdx backend uses a boolean round-trip (WhereExpression,
-    // Simd.h §A).
-    static IndexMaskVec widenMask(LeafMaskVec m)
-    {
-        if constexpr (W == 1) return m;
-        else return IndexMaskVec(m);
-    }
-
     // Compile-time fold: prefetch all HULL_SIZE hull directions.
     template <size_t... Is>
     void prefetchHull(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence<Is...>)
@@ -304,24 +291,19 @@ class StencilAccessor
         >(voVec, leafMask), ...);
     }

-    // Compile-time fold: cachedGetValue for all SIZE taps, where-blend into mIndices.
+    // Compile-time fold: cachedGetValue for all SIZE taps, write directly into mIndices.
+    // No where-blend: cachedGetValue's scalar tail writes only leafMask-active
+    // lanes; lanes outside leafMask keep whatever was written by a previous
+    // straddling-loop iteration (or zero from the initial memset).
     template <size_t... Is>
     void calcTaps(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence<Is...>)
     {
-        (blendOneTap<Is>(voVec, leafMask), ...);
-    }
-
-    // Fetch one tap and blend its result into mIndices[I] for the active lanes.
-    // The where(leafMask, mIndices[I]) = tmp blend uses the heterogeneous
-    // where() overload from Simd.h: LeafMaskVec (uint32_t) applied to
-    // IndexVec (uint64_t). Both are W-bit masks; Simd.h handles the conversion.
-    template <size_t I>
-    void blendOneTap(OffsetVec voVec, LeafMaskVec leafMask)
-    {
-        using P = std::tuple_element_t<I, typename StencilT::Taps>;
-        IndexVec tmp(uint64_t(0));
-        mBatch.template cachedGetValue<P::di, P::dj, P::dk>(tmp, voVec, leafMask);
-        util::where(leafMask, mIndices[I]) = tmp;
+        using Taps = typename StencilT::Taps;
+        (mBatch.template cachedGetValue<
+            std::tuple_element_t<Is, Taps>::di,
+            std::tuple_element_t<Is, Taps>::dj,
+            std::tuple_element_t<Is, Taps>::dk
+        >(mIndices[Is], voVec, leafMask), ...);
     }

     // -------------------------------------------------------------------------
     // Members
     // -------------------------------------------------------------------------
     BatchAcc mBatch;          // owns neighbor-leaf cache, mCenterLeafID
-    IndexVec mIndices[SIZE];  // one vector per tap — output store

 #ifndef NDEBUG
     uint32_t mNExtraLeaves;   // removable sanity bound on center-leaf advances

diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md
index ee86340506..facf7e8a61 100644
--- a/nanovdb/nanovdb/util/StencilAccessor.md
+++ b/nanovdb/nanovdb/util/StencilAccessor.md
@@ -380,20 +380,63 @@ Both loops expand to zero-overhead compile-time instantiations:

 where `blendOneTap` calls `cachedGetValue` into a temporary and then
 `where`-blends into `mIndices[I]`.
-### 8.1 GCC codegen note — `[[gnu::flatten]]` on `moveTo` - -Under GCC 13 + `-O3`, the default inliner outlines both the 14 Simd.h helpers -inside each `cachedGetValue` and the 18 per-tap `cachedGetValue` calls -themselves, producing ~282 `vzeroupper` transitions per 16-voxel batch and -making this whole SIMD pipeline measurably slower than the scalar -`LegacyStencilAccessor` oracle. Annotating `moveTo` with `[[gnu::flatten]]` -collapses the full call tree into a single ~77 KB inlined body, restoring -end-to-end performance from 7.5 ns/voxel to 3.7 ns/voxel (2×) and beating -Clang's 4.3 ns/voxel in the same test. The attribute is a no-op under Clang -(which inlines by default) and is safe to add, but the header does not apply -it by default — see `BatchAccessor.md` §8h for the measurement matrix and the -rationale for leaving it opt-in. Consumers that instantiate -`StencilAccessor` in hot GCC-compiled code paths should consider enabling it. +### 8.1 Hybrid SIMD → scalar-tail design and public API + +`StencilAccessor` uses the hybrid design documented in `BatchAccessor.md` +§8i. The straddling loop in `moveTo` and the SWAR / direction-extraction +portion of each tap are SIMD; `BatchAccessor::cachedGetValue` then harvests +per-lane direction and local-offset values into stack C arrays and runs a +scalar loop calling `leaf.getValue(offset)`. Each tap writes directly into +`mIndices[I][0..W-1]` — one scalar `mov` per active lane, no +mask-bool round-trip. + +#### Public API is Simd-free + +| Member | Type | +|--------|:-----| +| `mIndices` (public) | `alignas(64) uint64_t[SIZE][W]` — results buffer, populated by `moveTo()` | +| `moveTo(leafIndex*, voxelOffset*)` | returns `void` | +| `tapIndex()` (static constexpr) | `int` — compile-time tap slot lookup | +| `size()` (static constexpr) | `int` | + +Callers consume `mIndices` directly. Active-lane information comes from +`leafIndex[i] != UnusedLeafIndex` — the same sentinel that `decodeInverseMaps` +produces. No `SimdMask<>` or `Simd<>` appears in the API. + +```cpp +stencilAcc.moveTo(leafIndex + bs, voxelOffset + bs); +for (int i = 0; i < W; ++i) { + if (leafIndex[bs + i] == CPUVBM::UnusedLeafIndex) continue; + // named-tap access (compile-time, reorder-safe): + uint64_t idx_xm3 = stencilAcc.mIndices[SAccT::tapIndex<-3,0,0>()][i]; + // iteration: + for (int k = 0; k < SAccT::size(); ++k) + consume(stencilAcc.mIndices[k][i]); + // SIMD load of tap row using caller's own backend: + auto row = nanovdb::util::Simd(stencilAcc.mIndices[k], + nanovdb::util::element_aligned); +} +``` + +#### Layout is ABI + +`mIndices[SIZE][W]` row-major is part of the contract. Changing it (for +example to `[W][SIZE]` or to a SIMD aggregate) is a breaking change. The +choice matches how the scalar tail produces the data, so "what's written" +and "what's read" share a single authoritative layout. + +#### GCC codegen (short version) + +With the hybrid in place, neither compiler needs `[[gnu::flatten]]` to +reach reasonable performance. Measured at 32 M ambient voxels / 50% / 32 +threads on i9-285K Arrow Lake: GCC 5.1 ns/voxel, Clang 4.9 ns/voxel — +both beat the scalar `LegacyStencilAccessor` oracle (5.5 GCC, 6.7 Clang). +Adding `flatten` on `moveTo` closes the compiler gap to ~4.8 ns/voxel on +both; the 0.3 ns/voxel gain is not worth the 77 KB monolithic body for +default builds. Consumers that need peak GCC performance can still +annotate their own entry point. 
See `BatchAccessor.md` §8i for the full
perf matrix and the analysis of which operations were kept SIMD vs
scalarized.

---

From c4b2e909761c6c6b8b58bbacb30eeae091bbb8e7 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sun, 19 Apr 2026 12:21:08 -0500
Subject: [PATCH 34/60] StencilAccessor/BatchAccessor: hybrid SIMD ->
 scalar-tail cachedGetValue: decomposition infra
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds benchmarking infrastructure used to decompose the stencil-gather
pipeline into its real cost components via perf PMU counters. None of this
ships in the default code path; it lives alongside the hybrid design to
support follow-up perf analysis and to preserve decomposition variants that
might be reused later.

BatchAccessor:

- Add mFirstLeaf cached base pointer (grid.tree().getFirstLeaf()) so the
  scalar tail in cachedGetValue can do leaf lookups without going back
  through mGrid.tree() each time.

- Add cachedGetValueInLeaf: benchmarking-only variant of cachedGetValue
  that wraps taps to the center leaf via a mod-8 SWAR mask
  (kSwarFieldMask = 0x1CE7). Used to isolate the hybrid's single-leaf
  floor cost with no cross-leaf variety. Requires (di,dj,dk) in [0,7]
  per axis.

StencilAccessor:

- Add moveToInLeaf: counterpart to moveTo that calls cachedGetValueInLeaf
  instead of cachedGetValue. prefetchHull is skipped (center leaf is
  always cached after construction/advance). Benchmarking only — results
  have no geometric meaning.

ex_stencil_gather_cpu:

- Add CLI flags:
    --pass=<name>    run one pass in isolation. Supported names: framing,
                     decode, center-hit, stencil, degenerate, inleaf,
                     legacy, legacy-branchless, all (default), verify.
                     Needed for clean perf-stat attribution — the default
                     harness runs every variant back-to-back and perf
                     cannot restrict counters to a subrange.
    --threads=<n>    gate TBB parallelism via tbb::global_control.
                     Single-threaded runs give cleaner perf attribution;
                     taskset alone does not limit TBB's worker-thread
                     count.

- Add DegenerateStencil: 18 identical (0,0,0) taps. Under CSE the per-tap
  work collapses to 1 evaluation per lane — isolates the hybrid's pipeline
  framing cost when the compiler can factor out all tap work.

- Add InLeafStencil: 18 distinct compile-time taps in [0,6] per axis.
  Combined with moveToInLeaf, exercises the full hybrid pipeline with 18
  distinct per-tap work items that all hit the center leaf — isolates the
  "same-leaf no-CSE" floor.

- Add four new timed passes (with anti-DCE checksums):
    degenerateUs        DegAccT .moveTo()
    inLeafUs            InLeafAccT.moveToInLeaf()
    framingUs           Legacy loop with no accessor call
    centerHitUs         Legacy loop with 18 distinct same-leaf coords
                        (CSE-defeated)
    legacyBranchlessUs  Legacy loop with leaf.getValue() replaced by the
                        unconditional formula mOffset + prefix +
                        popcount(maskWord & mask-1) — drops the
                        valueMask.isOn branch, giving a 3x speedup.

- decodeUs baseline already existed; kept.

- Each pass is guarded by wantPass(name) so --pass=<name> runs exactly one
  variant in isolation.

- Output extended: per-pass ns/voxel lines + a Legacy cost decomposition
  block (framing / cache+leaf / tree walk).

Architectural summary: the decomposition revealed that the true CPU
bottleneck is unpredictable branch mispredicts on the valueMask.isOn()
check inside LeafNode::getValue(offset), not L1 pressure or tree-walk
latency. Detailed perf matrix, revised attribution, and follow-up
implications are documented in BatchAccessor.md §8j and
StencilAccessor.md §8.2 (separate commit).
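An illustrative invocation of the new knobs (binary name as built from this
example target; the two positional arguments, ambient voxels and then
occupancy, must precede the flags):

    taskset -c 0 ./ex_stencil_gather_cpu 33554432 0.5 --pass=inleaf --threads=1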
Signed-off-by: Efty Sifakis
---
 .../stencil_gather_cpu.cpp                |  300 +++++++++++++++++-
 nanovdb/nanovdb/util/BatchAccessor.h      |   63 ++++
 nanovdb/nanovdb/util/StencilAccessor.h    |   58 ++++
 3 files changed, 408 insertions(+), 13 deletions(-)

diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
index 6234739711..e8911a63c6 100644
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
@@ -49,9 +49,11 @@
 #include
 #include
 #include
+#include <memory>   // std::unique_ptr
 #include
 #include   // std::accumulate (checksum)
 #include
+#include <tbb/global_control.h>

 // ============================================================
 // Constants and type aliases
@@ -69,6 +71,55 @@ using CPUVBM = nanovdb::tools::VoxelBlockManager;
 using SAccT = nanovdb::StencilAccessor;
 using LegacyAccT = nanovdb::LegacyStencilAccessor;

+// Decomposition-only stencil: 18 taps all at (0,0,0). Measures the hybrid
+// StencilAccessor's floor cost when no tap crosses a leaf boundary and every
+// lookup hits the center leaf. Subtracting this from the Weno5 run isolates
+// the cross-leaf overhead — BUT the 18 identical compile-time taps give the
+// compiler a large CSE opportunity, biasing the number downward.
+struct DegenerateStencil {
+    using Taps = std::tuple<
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>,
+        nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>
+    >;
+    // Empty Hull: prefetchHull becomes a no-op; center leaf is always cached
+    // by BatchAccessor's constructor / advance().
+    using Hull = std::tuple<>;
+};
+using DegAccT = nanovdb::StencilAccessor;
+
+// CSE-resistant in-leaf stencil: 18 distinct compile-time taps spanning the
+// leaf's 8^3 footprint (all axes, 6 tap offsets in [0..6] per axis). Used
+// via StencilAccessor::moveToInLeaf, which applies (voxel_local + tap) mod 8
+// to the center voxel — guaranteeing every tap accesses the center leaf
+// while touching distinct mValueMask words across taps and across voxels.
+// This isolates the hybrid's single-leaf floor without the CSE bias that
+// DegenerateStencil suffers from.
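+// (Illustrative instance of the wrap: a voxel at local x == 6 combined with
+// tap <3,0,0> reads local x == (6 + 3) mod 8 == 1 of the center leaf itself,
+// not x == 1 of the +x neighbor leaf.)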
+struct InLeafStencil { + using Taps = std::tuple< + // x spans 0..6 (hits mValueMask words 0..6 depending on voxel's local x) + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<1,0,0>, + nanovdb::StencilPoint<2,0,0>, nanovdb::StencilPoint<3,0,0>, + nanovdb::StencilPoint<4,0,0>, nanovdb::StencilPoint<5,0,0>, + // y spans 1..6 (different destY positions within a word) + nanovdb::StencilPoint<0,1,0>, nanovdb::StencilPoint<0,2,0>, + nanovdb::StencilPoint<0,3,0>, nanovdb::StencilPoint<0,4,0>, + nanovdb::StencilPoint<0,5,0>, nanovdb::StencilPoint<0,6,0>, + // z spans 1..6 + nanovdb::StencilPoint<0,0,1>, nanovdb::StencilPoint<0,0,2>, + nanovdb::StencilPoint<0,0,3>, nanovdb::StencilPoint<0,0,4>, + nanovdb::StencilPoint<0,0,5>, nanovdb::StencilPoint<0,0,6> + >; + using Hull = std::tuple<>; // moveToInLeaf skips prefetchHull entirely +}; +using InLeafAccT = nanovdb::StencilAccessor; + // ============================================================ // Test domain generation (mirrors vbm_host_cuda.cpp) // ============================================================ @@ -256,8 +307,16 @@ static void runPrototype( static void runPerf( const GridT* grid, - const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) + const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle, + const std::string& passFilter = "all") { + // wantPass() returns true if this pass should run under the current filter. + // Supported names: "decode", "stencil", "degenerate", "inleaf", "framing", + // "center-hit", "legacy". "all" runs everything. + auto wantPass = [&](const char* name) { + return passFilter == "all" || passFilter == name; + }; + const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); const uint64_t nVoxels = grid->activeVoxelCount(); const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); @@ -286,7 +345,8 @@ static void runPerf( // ---- decodeInverseMaps-only baseline (both paths pay this cost) ---- // Anti-DCE: XOR one uint64_t per block derived from leafIndex[] + voxelOffset[] // so the compiler can't elide the decode work. 
- const double decodeUs = timeForEach([&] { + double decodeUs = 0.0; + if (wantPass("decode")) decodeUs = timeForEach([&] { nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; @@ -308,9 +368,12 @@ static void runPerf( }); // ---- StencilAccessor ---- + double stencilUs = 0.0; + uint64_t stencilChecksum = 0; + if (wantPass("stencil")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); - const double stencilUs = timeForEach([&] { + stencilUs = timeForEach([&] { nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; @@ -345,9 +408,99 @@ static void runPerf( }); }); - const uint64_t stencilChecksum = + stencilChecksum = std::accumulate(sums.begin(), sums.end(), uint64_t(0), [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("stencil") + + // ---- Hybrid floor: DegenerateStencil (18 taps all at (0,0,0)) ---- + double degenerateUs = 0.0; + uint64_t degenerateChecksum = 0; + if (wantPass("degenerate")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + degenerateUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + DegAccT degAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + uint64_t* bs = sums.data() + bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + degAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; + uint64_t s = 0; + for (int k = 0; k < DegAccT::size(); ++k) + s += degAcc.mIndices[k][i]; + bs[batchStart + i] = s; + } + } + } + }); + }); + degenerateChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("degenerate") + + // ---- Hybrid floor (CSE-resistant): 18 distinct taps wrapped to center leaf ---- + double inLeafUs = 0.0; + uint64_t inLeafChecksum = 0; + if (wantPass("inleaf")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + inLeafUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + InLeafAccT inLeafAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + uint64_t* bs = sums.data() + bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + inLeafAcc.moveToInLeaf( + leafIndex + 
batchStart, voxelOffset + batchStart); + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; + uint64_t s = 0; + for (int k = 0; k < InLeafAccT::size(); ++k) + s += inLeafAcc.mIndices[k][i]; + bs[batchStart + i] = s; + } + } + } + }); + }); + inLeafChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("inleaf") // ---- Legacy cost decomposition variants ---- // (a) "framing only" — Legacy loop structure, no accessor call (anti-DCE writes use li+k). @@ -362,8 +515,10 @@ static void runPerf( // Cache + leaf-lookup per voxel ≈ center-hit − framing. // Framing per voxel ≈ framing. + double framingUs = 0.0; + if (wantPass("framing")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); - const double framingUs = timeForEach([&] { + framingUs = timeForEach([&] { nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; @@ -394,9 +549,12 @@ static void runPerf( } }); }); + } // end wantPass("framing") + double centerHitUs = 0.0; + if (wantPass("center-hit")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); - const double centerHitUs = timeForEach([&] { + centerHitUs = timeForEach([&] { nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; @@ -433,10 +591,15 @@ static void runPerf( }); }); + } // end wantPass("center-hit") + // ---- LegacyStencilAccessor ---- + double legacyUs = 0.0; + uint64_t legacyChecksum = 0; + if (wantPass("legacy")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); - const double legacyUs = timeForEach([&] { + legacyUs = timeForEach([&] { nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; @@ -468,9 +631,94 @@ static void runPerf( }); }); - const uint64_t legacyChecksum = + legacyChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("legacy") + + // ---- Legacy branchless: same as legacy but skip the leaf.getValue isOn branch ---- + // Replaces `leaf.getValue(offset)` (which branches on valueMask.isOn(offset)) + // with the unconditional formula: + // mOffset + prefix9(wordIdx) + popcount(maskWord & ((1<); + + legacyBranchlessUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + auto acc = grid->getAccessor(); + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t* bs = bs0 + bID * BlockWidth; + + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); + + uint64_t s = 0; + auto addTap = [&](int di, int dj, int dk) { + const nanovdb::Coord c = center + nanovdb::Coord(di, dj, dk); + const LeafT* leaf 
= acc.probeLeaf(c); + if (!leaf) return; // tap outside narrow band (still branches, + // but well-predicted for active-region voxels) + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + const uint32_t wordIdx = offset >> 6; + const uint64_t bit = uint64_t(1) << (offset & 63); + const uint64_t maskWord = leaf->valueMask().words()[wordIdx]; + // prefix9 extract — cmov'd by the compiler, not a branch-miss source + const uint64_t prefix = (wordIdx > 0) + ? (leaf->data()->mPrefixSum >> (9u * (wordIdx - 1u))) & 511u + : uint64_t(0); + // UNCONDITIONAL: no isOn test. For OFF voxels this computes a + // non-zero value but does no branch. + s += leaf->data()->mOffset + prefix + + __builtin_popcountll(maskWord & (bit - 1)); + }; + + // Unroll all 18 WENO5 taps via the compile-time tuple. + [&](std::index_sequence) { + (addTap( + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk + ), ...); + }(std::make_index_sequence{}); + + bs[i] = s; + } + } + }); + }); + + legacyBranchlessChecksum = std::accumulate(sums.begin(), sums.end(), uint64_t(0), [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("legacy-branchless") std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", nBlocks, nVoxels); @@ -479,9 +727,18 @@ static void runPerf( std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels), (stencilUs - decodeUs) / 1e3, stencilChecksum); + std::printf(" Degenerate (18×center): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + degenerateUs / 1e3, degenerateUs * 1e3 / double(nVoxels), + (degenerateUs - decodeUs) / 1e3, degenerateChecksum); + std::printf(" InLeaf (18 distinct) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + inLeafUs / 1e3, inLeafUs * 1e3 / double(nVoxels), + (inLeafUs - decodeUs) / 1e3, inLeafChecksum); std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), (legacyUs - decodeUs) / 1e3, legacyChecksum); + std::printf(" Legacy branchless : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + legacyBranchlessUs / 1e3, legacyBranchlessUs * 1e3 / double(nVoxels), + (legacyBranchlessUs - decodeUs) / 1e3, legacyBranchlessChecksum); // Decomposition of LegacyStencilAccessor's ns/voxel: // framing = no accessor call @@ -511,15 +768,24 @@ static void runPerf( int main(int argc, char** argv) { try { - int ambient_voxels = 1024 * 1024; - float occupancy = 0.5f; + int ambient_voxels = 1024 * 1024; + float occupancy = 0.5f; + std::string passFilter = "all"; // --pass= + int nThreads = 0; // --threads=, 0 = TBB default if (argc > 1) ambient_voxels = std::stoi(argv[1]); if (argc > 2) occupancy = std::stof(argv[2]); + for (int i = 3; i < argc; ++i) { + std::string a = argv[i]; + if (a.rfind("--pass=", 0) == 0) passFilter = a.substr(7); + else if (a.rfind("--threads=", 0) == 0) nThreads = std::stoi(a.substr(10)); + } occupancy = std::max(0.0f, std::min(1.0f, occupancy)); std::cout << "ambient_voxels = " << ambient_voxels << "\n" - << "occupancy = " << occupancy << "\n"; + << "occupancy = " << occupancy << "\n" + << "pass = " << passFilter << "\n" + << "threads = " << (nThreads > 0 ? 
std::to_string(nThreads) : std::string("(TBB default)")) << "\n";

     auto coords = generateDomain(ambient_voxels, occupancy);
     std::cout << "Active voxels generated: " << coords.size() << "\n";
@@ -551,8 +817,16 @@ int main(int argc, char** argv)
     std::cout << "VBM blocks=" << vbmHandle.blockCount()
               << " (BlockWidth=" << BlockWidth << ")\n\n";

-    runPrototype(grid, vbmHandle);
-    runPerf(grid, vbmHandle);
+    // TBB thread-count limit for perf measurements.
+    std::unique_ptr<tbb::global_control> tbbLimit;
+    if (nThreads > 0) {
+        tbbLimit = std::make_unique<tbb::global_control>(
+            tbb::global_control::max_allowed_parallelism, (size_t)nThreads);
+    }
+
+    if (passFilter == "all" || passFilter == "verify")
+        runPrototype(grid, vbmHandle);
+    runPerf(grid, vbmHandle, passFilter);

 } catch (const std::exception& e) {
     std::cerr << "Exception: " << e.what() << "\n";

diff --git a/nanovdb/nanovdb/util/BatchAccessor.h b/nanovdb/nanovdb/util/BatchAccessor.h
index 995a82c0a6..8a074c26cd 100644
--- a/nanovdb/nanovdb/util/BatchAccessor.h
+++ b/nanovdb/nanovdb/util/BatchAccessor.h
@@ -415,6 +415,69 @@ class BatchAccessor
         }
     }

+    // -------------------------------------------------------------------------
+    // cachedGetValueInLeaf -- benchmarking variant that forces all
+    // taps to stay in the center leaf via mod-8 wrap.
+    //
+    // Purpose: measure the hybrid pipeline's floor cost when all 18 taps
+    // access the SAME leaf, with distinct per-tap / per-lane positions (so
+    // the compiler can't CSE across taps, and we still exercise different
+    // mValueMask words and prefix-sum slots). The result is semantically
+    //     target_local = (voxel_local + (di,dj,dk)) mod 8
+    // with target always in the center leaf (direction code 0).
+    //
+    // Implementation: same SWAR + harvest + scalar-tail pipeline as
+    // cachedGetValue, but after `packed_sum = expanded + packed_tap` we mask
+    // with kSwarFieldMask = 0x1CE7 to discard all inter-field carry bits,
+    // which is exactly `x mod 8 | y mod 8 | z mod 8` in the packed layout.
+    //
+    // Requires di, dj, dk in [0, 7]. No prefetch call needed; the center
+    // leaf is always in mNeighborLeafIDs[13] from construction/advance.
+    // -------------------------------------------------------------------------
+    template <int di, int dj, int dk>
+    void cachedGetValueInLeaf(ScalarValueT (&dst)[LaneWidth],
+                              VoxelOffsetT vo,
+                              PredicateT leafMask) const
+    {
+        static_assert(di >= 0 && di < 8 && dj >= 0 && dj < 8 && dk >= 0 && dk < 8,
+            "cachedGetValueInLeaf: tap offsets must be in [0, 7] per axis");
+
+        static constexpr auto packed_tap =
+            static_cast<VoxelOffsetScalarT>(
+                  unsigned(dk)
+                | (unsigned(dj) << 5)
+                | (unsigned(di) << 10));
+        const auto expanded =
+              ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask))
+            | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask));
+        // Mask off inter-field carry bits → per-axis mod-8 wrap; always center.
+        static constexpr uint16_t kSwarFieldMask = 0x1CE7u;
+        const auto packed_sum =
+            (expanded + VoxelOffsetT(packed_tap)) & VoxelOffsetT(kSwarFieldMask);
+
+        // Extract 9-bit local offset (same layout as cachedGetValue).
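+        // (Bit-layout reminder: x sits in bits 10..12, y in bits 5..7, z in
+        //  bits 0..2 of packed_sum; the shifts below repack those fields into
+        //  the leaf's x*64 + y*8 + z offset form. Worked example: (x,y,z) =
+        //  (5,2,7) is packed as 0x1447 and extracts to 0x157 = 5*64 + 2*8 + 7.)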
+        const auto localOffset_u16 =
+              ((packed_sum >> VoxelOffsetScalarT(4)) & VoxelOffsetT(0x1C0u))
+            | ((packed_sum >> VoxelOffsetScalarT(2)) & VoxelOffsetT(0x38u))
+            | (packed_sum & VoxelOffsetT(0x07u));
+
+        if constexpr (LaneWidth == 1) {
+            if (!leafMask) return;
+            dst[0] = static_cast<ScalarValueT>(
+                mFirstLeaf[mCenterLeafID].getValue(uint32_t(localOffset_u16)));
+        } else {
+            alignas(32) uint16_t localOffset[LaneWidth];
+            util::store(localOffset_u16, localOffset);
+            const uint32_t activeBits = util::to_bitmask(leafMask);
+            const LeafT* const leaf = &mFirstLeaf[mCenterLeafID]; // hoisted
+            for (int lane = 0; lane < LaneWidth; ++lane) {
+                if (!((activeBits >> lane) & 1u)) continue;
+                dst[lane] = static_cast<ScalarValueT>(
+                    leaf->getValue(localOffset[lane]));
+            }
+        }
+    }
+
 private:
     // Compute the world-space origin of the leaf at direction bit d from center.
     // bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1); leaf stride = 8 per axis.

diff --git a/nanovdb/nanovdb/util/StencilAccessor.h b/nanovdb/nanovdb/util/StencilAccessor.h
index 17f04f132a..2689457ebe 100644
--- a/nanovdb/nanovdb/util/StencilAccessor.h
+++ b/nanovdb/nanovdb/util/StencilAccessor.h
@@ -239,6 +239,52 @@ class StencilAccessor
         }
     }

+    // -------------------------------------------------------------------------
+    // moveToInLeaf -- benchmarking variant: identical to moveTo except that
+    // each tap is wrapped to the center leaf via (localVoxel + tap) mod 8.
+    //
+    // Purpose: measure the hybrid pipeline's floor cost with 18 distinct
+    // compile-time taps that all access the SAME leaf, preventing both the
+    // cross-leaf L1 pressure and the compiler CSE of identical taps. All
+    // StencilT::Taps offsets must be in [0, 7] per axis.
+    //
+    // NOT for production use -- results have no geometric meaning; they
+    // just exercise the hybrid's code path under a controlled cache regime.
+    // -------------------------------------------------------------------------
+    void moveToInLeaf(const uint32_t* leafIndex, const uint16_t* voxelOffset)
+    {
+        std::memset(mIndices, 0, sizeof(mIndices));
+
+        const LeafIdVec leafSlice = loadLeafIdVec(leafIndex);
+        const OffsetVec voVec     = loadOffsetVec(voxelOffset);
+
+        LeafMaskVec activeMask = (leafSlice != LeafIdVec(UnusedLeafIndex));
+
+        if (util::none_of(activeMask)) return;
+
+#ifndef NDEBUG
+        uint32_t nAdvances = 0;
+#endif
+
+        while (util::any_of(activeMask)) {
+            const LeafMaskVec leafMask =
+                activeMask & (leafSlice == LeafIdVec(mBatch.centerLeafID()));
+
+            if (util::none_of(leafMask)) {
+                mBatch.advance(mBatch.centerLeafID() + 1);
+#ifndef NDEBUG
+                assert(++nAdvances <= mNExtraLeaves);
+#endif
+                continue;
+            }
+
+            // No prefetchHull — all targets are the center leaf by construction.
+            calcTapsInLeaf(voVec, leafMask, std::make_index_sequence<SIZE>{});
+
+            activeMask = activeMask & !leafMask;
+        }
+    }
+
     // -------------------------------------------------------------------------
     // tapIndex() -- compile-time tap lookup.
     //
@@ -306,6 +352,18 @@ class StencilAccessor
         >(mIndices[Is], voVec, leafMask), ...);
     }

+    // Benchmark-only counterpart: forces all taps into the center leaf.
+    template <size_t... Is>
+    void calcTapsInLeaf(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence<Is...>)
+    {
+        using Taps = typename StencilT::Taps;
+        (mBatch.template cachedGetValueInLeaf<
+            std::tuple_element_t<Is, Taps>::di,
+            std::tuple_element_t<Is, Taps>::dj,
+            std::tuple_element_t<Is, Taps>::dk
+        >(mIndices[Is], voVec, leafMask), ...);
+    }
+
     // -------------------------------------------------------------------------
     // Members
     // -------------------------------------------------------------------------

From 8b37dbdd46e969d4ec487ac5a51c151b5780401a Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sun, 19 Apr 2026 12:22:12 -0500
Subject: [PATCH 35/60] BatchAccessor/StencilAccessor: document perf
 investigation and revised attribution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Captures the findings of the perf-counter investigation made possible by
the decomposition infrastructure in the prior commit. Preserves prior
claims as history (with targeted "Revision note" annotations pointing to
the superseding section), adds the new authoritative measurements and
decomposition, and updates the current-API sections to match the shipped
hybrid design.

BatchAccessor.md:

- §8g, §8h, §8i: prepend "Revision note" blocks pointing to §8j. The
  original text is preserved intact — only a short header note flags which
  claims were subsequently refined. Affected claims:
  * §8g cycle budget model: gather chain assumed to dominate; actually
    pipelined away by OoO, real dominant cost is branch mispredicts.
  * §8h "multi-leaf L1 pressure accounts for cross-leaf overhead": refuted,
    L1 miss rate is flat across all variants.
  * §8i "27-leaf cache eliminates tree walks" magnitude overstated;
    measured savings are ~0.3 ns/voxel, not ~3-4 ns/voxel.

- §8j (new, ~230 lines): full perf-counter chronicle. Nine subsections:
  8j.1 Motivation
  8j.2 Methodology (--pass/--threads flags, PMU setup, P-core pinning)
  8j.3 Measurement matrix for 6 variants (IPC, branch-miss rate, L1 miss
       rate, ns/voxel on a single P-core single-threaded)
  8j.4 Identifying valueMask.isOn(offset) as the real source
  8j.5 Branchless experiment (legacy-branchless pass) — 3x speedup on
       Legacy from 5.6 to 2.0 ns/voxel at 32 threads
  8j.6 Revised 5.4 ns/voxel attribution: ~3.6 isOn mispredicts, ~0.75
       leaf-local work, ~0.3 tree-walk diff, ~0 L1 pressure
  8j.7 NANOVDB_USE_INTRINSICS is a no-op on GCC 13 at -O3 -march=native
       (SWAR pattern-matched to popcnt) — enable for portability
  8j.8 Architectural implications (branchless LeafNode::getValue proposal,
       revised halo value proposition, hybrid rationale)
  8j.9 Historical correction log — single-table index of revised claims
       with their source sections

- §10 Status: add the hybrid refactor and §8j findings to Completed; add
  "branchless LeafNode::getValue" as the top Remaining item — the single
  biggest cheap CPU-side speedup available.

StencilAccessor.md:

- §8.2 (new): summary of the PMU finding pointing to BatchAccessor.md §8j.
  Contains the headline measurement matrix, branchless-experiment delta,
  revised 5.4 ns/voxel attribution, and consequences for design decisions.

- §9: rewrite to document the current API (tapIndex() + public
  mIndices[SIZE][W]). Prior getValue<>()/operator[] API was removed in the
  hybrid refactor; that evolution is noted in-section.

- §10 Caller-side usage pattern: rewrite to match current API — moveTo
  returns void, active-lane check via leafIndex[i] != UnusedLeafIndex,
  three direct mIndices access idioms (scalar iteration, SIMD row load,
  compile-time named-tap access).
- §11 Ownership summary: mIndices[SIZE] -> [SIZE][W]; tap-fold row updated to reflect direct writes (no where-blend). - §12 Design decisions 1 and 3: preserve original rationale, append "Evolution" / "Revised" notes explaining the shift to void moveTo return (decision 1) and removal of operator[] (decision 3) in favour of public mIndices + tapIndex<>(). Decisions 2, 4-6 unchanged. Net: +550 lines of documentation capturing the investigation as a chronicle, with every revised claim traceable to both the superseded prior section and the measurement that refined it. Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/util/BatchAccessor.md | 308 +++++++++++++++++++++++- nanovdb/nanovdb/util/StencilAccessor.md | 242 +++++++++++++++---- 2 files changed, 493 insertions(+), 57 deletions(-) diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md index 41851df786..f861d3f141 100644 --- a/nanovdb/nanovdb/util/BatchAccessor.md +++ b/nanovdb/nanovdb/util/BatchAccessor.md @@ -651,6 +651,18 @@ AVX2 popcount pattern and the likely replacement for `popcount64` in `Simd.h`. ### 8g. Cycle budget and architectural comparison +> **Revision note (see §8j).** The cycle-budget table below models the +> *historical* fully-SIMD `cachedGetValue` (§8e) and predicts a ~55-cycle +> critical path dominated by the gather chain. That pipeline no longer +> ships (hybrid refactor, §8i), and even for the scalar-tail path PMU +> measurement shows that the dominant cost is **not** gather/pointer-chase +> latency but rather **`valueMask.isOn(offset)` branch mispredicts** +> (§8j). The "4–10× CPU speedup over scalar" framing below remains +> directionally correct (the hybrid does still beat Legacy), but the +> magnitude is ~1.05× on 32-thread WENO5, not 4×. Use the §8j matrix as +> the authoritative measurement; treat this section as design-rationale +> history. + #### `cachedGetValue` critical path (Clang 18 + stdx + `-march=native`, W=16) | Step | Work | Cumulative cycles | @@ -733,6 +745,15 @@ on CPU. ### 8h. End-to-end perf: outlining, `[[gnu::flatten]]`, and W=8 +> **Revision note (see §8j).** The end-to-end measurements and `[[gnu::flatten]]` +> findings in this section are correct. The *attribution* of cross-leaf cost to +> "multi-leaf L1 pressure" — which appeared here and in the original analysis — +> was **wrong**. `perf` counter measurements later showed that L1 miss rates are +> flat across all variants (~0.4 %) and that the dominant cross-leaf cost is +> actually **branch-mispredict stalls on the `valueMask.isOn(offset)` check** +> inside `LeafNode::getValue(offset)`. See §8j for the full +> perf-counter investigation and revised decomposition. + §8f measured `cachedGetValue` as a standalone symbol. This section measures the **full WENO5 pipeline end-to-end** — `StencilAccessor::moveTo` driving 18 taps × 128 voxels/block × 131072 blocks across 32 TBB threads — and reveals a much @@ -910,6 +931,25 @@ a 77 KB monolithic body on callers with smaller working sets. ### 8i. Hybrid SIMD → scalar-tail design *(current)* +> **Revision note (see §8j).** The hybrid design and the perf-matrix numbers +> in this section are correct. Two *claims* in the "Cost of the refactor" / +> "Cleanup" subsections were subsequently refined: +> +> 1. The cross-leaf overhead (`Stencil − InLeaf ≈ 0.9 ns/voxel`) was attributed +> here to "multi-leaf L1 pressure". `perf` showed L1 miss rates are flat; +> the real source is additional unpredictable branches in the cross-leaf +> path. +> 2. 
The architectural claim that the 27-leaf
+>    neighbor cache eliminates full tree walks (§8i "Not applied" discussion)
+>    is correct structurally, but the *magnitude* of that savings is much
+>    smaller than implied. Measured via controlled decomposition:
+>    ~**0.3 ns/voxel** — about 6 % of Legacy's total 5.4 ns/voxel — not the
+>    majority of the "4.4 ns/voxel cross-leaf cost" this section's table
+>    implies. See §8j for the quantified breakdown.
+>
+> The hybrid design itself remains the right shipped choice; the refactor's
+> primary win is Simd-free public API and compiler-portable performance,
+> not the cache lookup.
+
 The findings of §8f/§8h motivated a different trade-off, which is what the
 codebase now ships.

@@ -1012,6 +1052,237 @@ caller using them.
 Added to support the hybrid: `util::store(v, p)` (a uniform `store` shim
 that dispatches to `copy_to` on stdx and `store` on the array backend).

+### 8j. `perf`-counter investigation — what actually bottlenecks the CPU path
+
+This section records the results of a direct PMU-counter investigation that
+replaced several rounds of structural reasoning and cycle-budget estimation
+(§8e–§8i) with measurements. **It revises or refutes several earlier claims**
+and identifies the single biggest lever for CPU-side speedup of any
+`ValueOnIndex` stencil gather.
+
+#### 8j.1 Motivation
+
+By §8i we had three working hypotheses for where the ~5.4 ns/voxel of Legacy
+(and ~5.1 ns/voxel of the hybrid) was spent:
+
+1. **Tree-walk pointer chases** on leaf-cache misses (~25 % of taps cross
+   leaves in WENO5).
+2. **L1 pressure** from touching up to 6 neighbour leaves' `mValueMask` /
+   `mPrefixSum` data per voxel.
+3. **Gather-chain latency** in the old SIMD pipeline (largely mitigated by
+   the hybrid refactor — §8i).
+
+All three were structural guesses, anchored by the cycle-budget table in §8g
+and by assembly reading. None had been validated with hardware counters.
+
+#### 8j.2 Methodology
+
+Added two CLI knobs to `ex_stencil_gather_cpu`:
+
+- `--pass=<name>` — runs exactly one of the timed variants
+  (`framing`, `decode`, `center-hit`, `legacy`, `legacy-branchless`,
+  `degenerate`, `inleaf`, `stencil`). Needed because the default harness
+  runs every variant back-to-back, and `perf stat` cannot attribute counters
+  to a subrange.
+- `--threads=<n>` — gates TBB parallelism via `tbb::global_control`. Needed
+  because `perf` event multiplexing and hybrid-CPU attribution is cleaner
+  single-threaded on a single P-core.
+
+Setup: i9-285K Arrow Lake (8 P-cores + 16 E-cores, no HT). Pin to
+`taskset -c 0` for the P-core. Lower `kernel.perf_event_paranoid` to 1.
+Baseline events: `cycles, instructions, branch-instructions, branch-misses,
+L1-dcache-loads, L1-dcache-load-misses`. Workload: 32 M ambient voxels /
+50 % occupancy (16.7 M active). Build: GCC 13.3 at `-O3 -march=native`
+with `NANOVDB_USE_INTRINSICS=ON` (though see §8j.7 for why this flag is a
+no-op on this toolchain).
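+
+A representative single-variant measurement under this setup then looks
+like (illustrative; binary name as built from the example target):
+`taskset -c 0 perf stat -e cycles,instructions,branch-instructions,branch-misses,L1-dcache-loads,L1-dcache-load-misses ./ex_stencil_gather_cpu 33554432 0.5 --pass=legacy --threads=1`.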
+#### 8j.3 Measurement matrix (single P-core, `--threads=1`)
+
+| Variant | ns/voxel | IPC | branch-miss | L1 miss | branch-misses / voxel |
+|---------|---------:|----:|------------:|--------:|----------------------:|
+| framing (no accessor call) | 3.2 | 2.52 | 3.15 % | 1.41 % | 2.05 |
+| center-hit × 18 (legacy, same leaf, 18 distinct coords) | 19.0 | **4.80** | **0.84 %** | 0.47 % | 2.38 |
+| Degenerate (hybrid, 18 × (0,0,0) — compiler CSE'd) | 29.0 | **4.02** | **0.75 %** | 0.41 % | 2.22 |
+| InLeaf (hybrid, 18 distinct same-leaf, no CSE) | 76.6 | **1.45** | **9.87 %** | 0.68 % | 23.1 |
+| Stencil (hybrid, WENO5 cross-leaf) | 96.9 | **1.53** | **8.75 %** | 0.46 % | 24.1 |
+| Legacy (WENO5, 1-slot path cache) | 99.2 | **1.98** | **8.85 %** | 0.40 % | 26.7 |
+
+Three immediate observations from this matrix:
+
+1. **L1-dcache-load-misses is flat** across all six variants (0.40 – 0.68 %,
+   absolute counts 25.8 – 28.3 M). The multi-leaf L1 pressure hypothesis is
+   **falsified**. Even WENO5's 6-leaf working set stays L1-resident.
+2. **Branch-miss rate splits cleanly into two groups**: "good" (0.75 – 0.84 %)
+   and "bad" (8.75 – 9.87 %). The split is not along tree-walk lines —
+   InLeaf has **no** tree walks (it is same-leaf by construction) yet lands
+   in the "bad" group with the highest miss rate of all.
+3. **IPC collapses from ~4.5 to ~1.5** between the two groups. A backend
+   throughput difference of 3× is far too large to be attributable to any
+   single cache effect.
+
+#### 8j.4 Identifying the real source — the `valueMask.isOn(offset)` branch
+
+Every path that ends at `LeafNode::getValue(offset)` evaluates:
+
+```cpp
+uint32_t n = i >> 6;
+uint64_t w = mValueMask.words()[n], mask = 1ull << (i & 63u);
+if (!(w & mask)) return 0;                 // ← unpredictable branch
+uint64_t sum = mOffset + util::countOn(w & (mask - 1u));
+if (n--) sum += mPrefixSum >> (9u * n) & 511u;
+return sum;
+```
+
+For our 50 %-occupancy workload, tap positions land on ON vs OFF bits with
+roughly 60/40 frequency (spatially correlated but not perfectly). **This
+branch is fundamentally unpredictable.** Its cost compounds: each 16-voxel
+batch issues ~288 taps, and at ~23–27 mispredicts per voxel (matrix above)
+and a ~15-cycle mispredict penalty, mispredict stalls alone come to roughly
+350–400 cycles per voxel, the dominant stall in both the hybrid and Legacy
+paths.
+
+Why do Degenerate and center-hit escape it?
+
+- **Degenerate**: 18 identical compile-time taps produce 18 identical values
+  per lane. GCC CSEs the entire per-lane computation (including the `isOn`
+  check) down to 1 evaluation + 18 stores of the same value. One branch per
+  lane survives instead of 18.
+- **center-hit (legacy)**: after the tight loop is fully inlined, GCC emits
+  the `isOn`-guarded return as a **branchless `cmov`** pattern. Verified by
+  disassembly: no conditional jump in the hot path. This is not a general
+  property — it happens because `acc.getValue(coord)` in its minimal form
+  exposes a clean `?:`-equivalent to the compiler. In the hybrid's scalar
+  tail (larger function body, per-lane loop, harvest-buffer loads), GCC
+  keeps the `isOn` as a conditional jump.
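+
+The effect is easy to reproduce in isolation. The following self-contained
+toy (illustrative only; not part of the harness, all names local to the
+snippet) queries a 50 %-random bitmask with the branchy early-out versus
+the arithmetic gate of §8j.8; both forms return identical sums, so only the
+branch behaviour differs:
+
+```cpp
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <random>
+
+int main() {
+    constexpr uint32_t N = 1u << 22;            // 4M queries over a 50% mask
+    static uint64_t words[N / 64];
+    static uint32_t offs[N];
+    std::mt19937_64 rng(42);
+    for (auto& w : words) w = rng();            // ~50% ON bits, like the workload
+    for (auto& o : offs) o = uint32_t(rng()) & (N - 1);
+
+    auto bench = [&](const char* name, auto rank) {
+        const auto t0 = std::chrono::steady_clock::now();
+        uint64_t sum = 0;
+        for (uint32_t o : offs) sum += rank(o);
+        const auto t1 = std::chrono::steady_clock::now();
+        std::printf("%-10s %8.0f us (sum=%llu)\n", name,   // print = anti-DCE
+            std::chrono::duration<double, std::micro>(t1 - t0).count(),
+            (unsigned long long)sum);
+    };
+    bench("branchy", [&](uint32_t i) -> uint64_t {         // LeafNode-style early-out
+        const uint64_t w = words[i >> 6], bit = 1ull << (i & 63u);
+        if (!(w & bit)) return 0;                          // mispredicts ~50% of the time
+        return 1u + __builtin_popcountll(w & (bit - 1u));
+    });
+    bench("branchless", [&](uint32_t i) -> uint64_t {      // arithmetic gate (cf. §8j.8)
+        const uint64_t w = words[i >> 6], bit = 1ull << (i & 63u);
+        const uint64_t on = (w & bit) ? ~0ull : 0ull;      // commonly emitted as test+cmov
+        return on & (1u + __builtin_popcountll(w & (bit - 1u)));
+    });
+    return 0;
+}
+```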
+ +#### 8j.5 Branchless experiment — quantifying the `isOn` cost + +Added a `legacy-branchless` variant that replaces the `leaf.getValue(offset)` +call with the unconditional formula inlined at the call site: + +```cpp +// in place of `leaf.getValue(offset)` with isOn check: +const uint32_t offset = (c[0]&7)<<6 | (c[1]&7)<<3 | c[2]&7; +const uint32_t wordIdx = offset >> 6; +const uint64_t bit = 1ull << (offset & 63); +const uint64_t word = leaf->valueMask().words()[wordIdx]; +const uint64_t prefix = (wordIdx > 0) + ? (leaf->data()->mPrefixSum >> (9 * (wordIdx - 1))) & 511 + : 0; +s += leaf->data()->mOffset + prefix + __builtin_popcountll(word & (bit - 1)); +// No isOn check. Produces a non-zero "wrong" value for OFF voxels — +// so the checksum will NOT match — but wall-clock and PMU counters are clean. +``` + +Results: + +| Metric | Legacy (with `isOn`) | Legacy branchless | Δ | +|-----------------------|---------------------:|------------------:|------:| +| ns/voxel (32 thread) | 5.6 | **2.0** | −3.6 | +| ns/voxel (1 P-core) | 103.7 | **33.2** | −70.5 | +| IPC | 1.98 | **4.29** | 2.2× | +| branch-miss rate | 8.07 % | **1.67 %** | −5× | +| branch-misses / voxel | 27 | **4.6** | −6× | +| L1 miss rate | 0.36 % | 0.48 % | ~0 | +| instructions / voxel | 2646 | 2416 | −9 % | + +**The single change of removing the `isOn` branch recovers a 3× speedup on +Legacy end-to-end.** It accounts for the entire IPC collapse. The tree +walk inside `acc.probeLeaf()` is preserved in this variant, so the speedup +is not from avoiding tree walks — it is from removing the pipeline stalls +caused by mispredicting one branch per tap. + +#### 8j.6 Revised attribution of Legacy WENO5's 5.4 ns/voxel + +| Component | ns/voxel | How isolated | +|-----------------------------------------------|---------:|:-------------| +| Framing (decodeInverseMaps, loop, anti-DCE) | 0.25 | measured standalone | +| Leaf-local `getValue` work (loads + `popcnt`) | 0.75 | center-hit × 18 minus framing | +| `valueMask.isOn` branch mispredicts (~24/voxel × ~15 cy) | **~3.6** | Legacy minus Legacy-branchless | +| Full tree walk vs 27-leaf cache (stencil minus legacy) | **~0.3** | Stencil minus Legacy (or Legacy-branchless minus Stencil-branchless, if both existed) | +| Multi-leaf L1 pressure | ~0 | measured: L1 miss rate flat | +| **Total** | **~5.4** | | + +The earlier framing — that "tree walks and L1 pressure dominate" — was +wrong. Both turn out to be minor. The entire ~78 % of Legacy's cost that +§8h attributed to "cross-leaf overhead" is actually **~80 % `isOn` mispredicts, +~10 % real tree-walk work, ~10 % other**. + +#### 8j.7 `NANOVDB_USE_INTRINSICS` is a no-op on GCC 13 at `-O3 -march=native` + +`util::countOn(uint64_t)` in `nanovdb/util/Util.h` gates +`__builtin_popcountll` behind `NANOVDB_USE_INTRINSICS`; the fallback is a +SWAR popcount that uses a magic multiply (`0x0101010101010101`). Verified +by `objdump`: the compiled binary contains 178 `popcnt` instructions and +only 1 occurrence of the SWAR magic multiply. GCC's peephole pattern +matcher at `-O3` recognises the SWAR shape and replaces it with hardware +`popcnt` whether or not `NANOVDB_USE_INTRINSICS` is defined. This is +brittle (depends on GCC version, flags, and code layout); the macro should +be enabled explicitly in production builds for portability, but none of +the perf numbers in this section change when it is toggled. + +#### 8j.8 Architectural implications + +1. 
**BatchAccessor's 27-leaf cache addresses ~6 % of the total cost.** Its + architectural value over the scalar `DefaultReadAccessor`'s 1-slot cache + is real but modest on this workload. The neighbour cache eliminates the + full root-to-leaf traversal on every cross-leaf tap (§8i, confirmed + structurally), but the wall-clock saving is ~0.3 ns/voxel — dominated by + OoO pipelining of otherwise-serial pointer chases. + +2. **The biggest cheap CPU win available is branchless + `LeafNode::getValue(offset)` in NanoVDB proper.** Rewriting + that function (perhaps ~15 lines, preserving semantics for OFF voxels via + a branchless arithmetic gate) would give every stencil-gather caller — + Legacy, hybrid, HaloStencilAccessor, any future variant — a 2–3× speedup + on CPU. Proposed form, sketched below, keeps OFF-returns-0 semantics: + + ```cpp + // sketch, not tested: + __hostdev__ uint64_t getValue(uint32_t i) const { + const uint32_t n = i >> 6; + const uint64_t w = mValueMask.words()[n]; + const uint64_t bit = 1ull << (i & 63u); + const uint64_t mask = bit - 1u; + const uint64_t on = (w & bit) ? ~0ull : 0ull; // cmov via explicit ternary + const uint64_t pfx = n ? ((mPrefixSum >> (9u * (n - 1u))) & 511u) : 0ull; + return on & (mOffset + pfx + util::countOn(w & mask)); + } + ``` + (The `on` gate pattern compiles to a `test`+`cmov` on GCC; the + `leaf.getValue` call pays one predictable branch instead of one + unpredictable one. Needs benchmarking to confirm the optimiser doesn't + refold it into a conditional jump.) + +3. **HaloStencilAccessor's value proposition is validated but smaller than + advertised.** Its core architectural advantage (precomputed uint64 + indices per tap position, so stencil queries are unconditional indexed + loads) naturally eliminates the `isOn` branch. But a branchless + `LeafNode::getValue` would capture most of the same win without needing + the halo-buffer infrastructure. The halo still wins on absolute perf + (zero per-tap work at query time), but the delta over a branchless + leaf lookup is more like ~0.5–1 ns/voxel than the "sub-2 ns/voxel + territory" framed earlier. + +4. **The hybrid `StencilAccessor`'s design rationale needs a small rewrite.** + The shipped hybrid design (§8i) is still the right API choice (Simd-free + public surface, compiler-portable perf) — but the justification is not + "it beats the gather chain's L1 pressure" (there is none); it is "it + matches the compiler's natural inlining / vectorisation model for this + workload and eliminates the outlining/vzeroupper pathology (§8h)." The + gain over Legacy WENO5 is marginal (~0.3 ns/voxel) because both pay the + same dominant `isOn` mispredict cost; the hybrid's real value emerges + only if and when `leaf.getValue` is made branchless. + +#### 8j.9 Historical correction log + +| Earlier claim | Source | Revised to | +|------------------------------------------------------------|:-----------|:-----------| +| "Tree-walk latency is the critical path" (cycle-budget) | §8g | OoO absorbs most of it; isOn mispredicts dominate. | +| "Multi-leaf L1 pressure accounts for ~0.9 ns/voxel cross-leaf overhead" | §8h, §8i | L1 miss rate is flat; the 0.9 ns/voxel is mostly isOn mispredicts shared with same-leaf InLeaf. | +| "Tree walks cost ~78 % of Legacy's time (4.4 ns/voxel)" | §8h (implicit); my thread claim | Real tree-walk cost is ~0.3 ns/voxel; the 4.4 ns/voxel was mostly isOn mispredicts. 
| +| "Degenerate ~1.7 ns/voxel is the hybrid's floor" | my thread claim | Degenerate is heavily CSE-biased; real floor is InLeaf at ~4.2 ns/voxel, of which ~3.5 is isOn mispredicts. | +| "`NANOVDB_USE_INTRINSICS` matters for popcount-heavy paths" | general assumption | No-op on GCC `-O3 -march=native`: SWAR → popcnt pattern match. Enable for portability anyway. | +| "27-leaf cache is the architectural win of BatchAccessor" | §8i "Cost of the refactor" | Cache delta is ~0.3 ns/voxel. Real wins are the Simd-free API and flatten-free compiler portability (§8i). | + --- ## 9. Relationship to Phase 1 Prototype @@ -1060,14 +1331,43 @@ the array backend). 86% but regresses GCC end-to-end due to per-batch framing overhead. Attributes **not applied** in the shipped code; see §8h "Not applied" note. +- **Hybrid SIMD → scalar-tail refactor (§8i)**: shipped. `BatchAccessor::cachedGetValue` + now keeps the SIMD SWAR setup and harvests per-lane direction / local-offset + into C arrays for a plain scalar tail calling `leaf.getValue(offset)`. + Public API of `StencilAccessor` is Simd-free; performance is within + ~0.3 ns/voxel of the old flatten-forced path, but compiler-portable. + +- **PMU-counter investigation (§8j)**: validated the above empirically and + refuted two earlier working hypotheses. Specifically: + - L1 miss rate is flat across all variants (~0.4 %) — **multi-leaf L1 + pressure is not a factor**. + - The dominant cost (~65 % of Legacy's 5.4 ns/voxel) is branch-mispredict + stalls on the **`valueMask.isOn(offset)` check** inside + `LeafNode::getValue(offset)`. + - A branchless reformulation of that call recovers a 3× speedup + (5.6 → 2.0 ns/voxel on 32 threads) with IPC rising from 1.98 to 4.29. + - Tree-walk elimination by the 27-leaf cache saves ~0.3 ns/voxel, not + the ~3 – 4 ns/voxel implied by §8h/§8i. + - `NANOVDB_USE_INTRINSICS` is a no-op on GCC 13 at `-O3 -march=native` + (SWAR `util::countOn` is pattern-matched to hardware `popcnt`). Enable + it in the build anyway for portability. + ### Remaining +- **Branchless `LeafNode::getValue(offset)` in NanoVDB** (§8j). + The single biggest available CPU-side speedup for any stencil caller. A + ~15-line rewrite that preserves the OFF-returns-0 semantics via an + arithmetic mask gate instead of a conditional `return 0` would give + Legacy, the hybrid, HaloStencilAccessor, and any future variant a 2–3× + end-to-end speedup on 32-thread WENO5 workloads. Needs benchmarking to + confirm GCC/Clang don't refold the gate back into a branch. + - **`[[gnu::always_inline]]` on `Simd.h` helpers** (§8f) vs **`[[gnu::flatten]]` on StencilAccessor-style entry points** (§8h): - two candidate approaches to restore GCC inlining. The flatten path was - measured end-to-end (2× speedup); the always_inline path was measured only - on the standalone `cachedGetValue` symbol. Decide which to ship once a - consumer of StencilAccessor exists in the production build. + two candidate approaches to restore GCC inlining. Mostly superseded + by the hybrid refactor (§8i) and the branchless-leaf opportunity + (§8j); leave open in case later callers reintroduce the outlining + pathology. 
- **`vpshufb`-based `popcount` in `Simd.h`:** replace `popcount64` SWAR tree with nibble-LUT + `vpsadbw` pattern (§8f); reduces the out-of-line body from 88 to ≈40 diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md index facf7e8a61..17efc3f40e 100644 --- a/nanovdb/nanovdb/util/StencilAccessor.md +++ b/nanovdb/nanovdb/util/StencilAccessor.md @@ -438,59 +438,181 @@ annotate their own entry point. See `BatchAccessor.md` §8i for the full perf matrix and the analysis of which operations were kept SIMD vs scalarized. +### 8.2 What actually bottlenecks the CPU path — `valueMask.isOn` mispredicts + +A PMU-counter investigation (`BatchAccessor.md` §8j) replaced several rounds +of structural reasoning with hardware measurements. It refutes two +hypotheses that had shaped earlier design discussions and identifies the +one lever that dominates CPU performance. + +#### What we measured + +On a single P-core of an i9-285K (Arrow Lake, 8 P + 16 E, GCC 13 at +`-O3 -march=native`, 32 M-voxel / 50 %-occupancy workload), comparing +per-variant PMU counters for every benchmarking pass exposed via +`ex_stencil_gather_cpu --pass=`: + +| Variant | ns/voxel | IPC | branch-miss | L1 miss | +|---------|---------:|----:|------------:|--------:| +| Degenerate (18 × (0,0,0), CSE'd) | 29.0 | 4.02 | **0.75 %** | 0.41 % | +| center-hit × 18 (Legacy, same-leaf, `cmov`'d) | 19.0 | 4.80 | **0.84 %** | 0.47 % | +| InLeaf (hybrid, 18 distinct same-leaf, no CSE) | 76.6 | 1.45 | **9.87 %** | 0.68 % | +| Stencil (hybrid WENO5 cross-leaf) | 96.9 | 1.53 | **8.75 %** | 0.46 % | +| Legacy (WENO5, full tree walks) | 99.2 | 1.98 | **8.85 %** | 0.40 % | + +#### The two big findings + +1. **L1-dcache miss rates are flat across all variants (~0.4–0.7 %).** + Multi-leaf L1 pressure — the earlier narrative for why cross-leaf taps + cost so much — is **not a factor** on this workload. The neighbour + leaves' `mValueMask` / `mPrefixSum` data stays L1-resident throughout a + VBM block. + +2. **Branch-miss rates split cleanly into two groups**, and the split is + not along tree-walk lines. InLeaf has *no* tree walks (it wraps taps + mod 8 to the centre leaf by construction) but still lands in the "bad" + group at 9.87 % — higher than Legacy. The common factor is the + **`valueMask.isOn(offset)` conditional** inside + `LeafNode::getValue(offset)`: + + ```cpp + if (!(w & mask)) return 0; // data-dependent, ~50/50 outcome, unpredictable + ``` + + Every per-tap leaf lookup in the "bad" group — the hybrid's scalar tail, + Legacy's `legacyAcc[k]`, InLeaf's `cachedGetValueInLeaf` — routes through + this branch. Degenerate escapes it via CSE (18 identical taps collapse + to 1 evaluation). center-hit escapes it because GCC's inliner in that + tight loop emits the guarded return as a branchless `cmov` — an + optimiser accident, not a general property. + +#### Branchless experiment + +A `legacy-branchless` variant that replaces `leaf.getValue(offset)` with +the unconditional formula inlined at the call site (see §8j.5) recovers a +**3× speedup on Legacy**: from 5.6 ns/voxel to 2.0 ns/voxel at 32 threads, +IPC from 1.98 to 4.29, branch-miss rate from 8.07 % to 1.67 %. The +tree-walk machinery (`acc.probeLeaf()`) is preserved in that variant; the +only thing removed is the single `isOn` branch per tap. That single +change accounts for ~65 % of Legacy's total wall-clock time. 
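+
+For reference, the core of the replacement, condensed from the harness
+variant (full listing in `BatchAccessor.md` §8j.5; `leaf`, `offset`, and
+the accumulator `s` are as in the Legacy tap loop):
+
+```cpp
+const uint32_t wordIdx = offset >> 6;
+const uint64_t bit     = 1ull << (offset & 63);
+const uint64_t word    = leaf->valueMask().words()[wordIdx];
+const uint64_t prefix  = (wordIdx > 0)
+    ? (leaf->data()->mPrefixSum >> (9 * (wordIdx - 1))) & 511
+    : 0;
+// No isOn check: OFF voxels yield a meaningless value, but no branch.
+s += leaf->data()->mOffset + prefix + __builtin_popcountll(word & (bit - 1));
+```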
+ +#### Revised attribution of Legacy's 5.4 ns/voxel + +| Component | ns/voxel | +|-----------|---------:| +| Framing (`decodeInverseMaps`, loop, anti-DCE) | 0.25 | +| Leaf-local `getValue` work (loads + `popcnt`) | 0.75 | +| **`valueMask.isOn` branch mispredicts** (~24/voxel × ~15 cy) | **~3.6** | +| Tree walk vs 27-leaf cache differential | ~0.3 | +| Multi-leaf L1 pressure | ~0 | +| **Total** | **~5.4** | + +Earlier versions of this section attributed the bulk of Legacy's cost to +tree-walk pointer chases and multi-leaf L1 traffic; both turned out to +be minor. The hybrid `StencilAccessor` matches Legacy (~5.1 ns/voxel) +because both pay the same dominant `isOn` mispredict cost. + +#### Consequence for architectural decisions + +- **The shipped hybrid design is the right API choice** (Simd-free public + surface, compiler-portable) but its wall-clock edge over Legacy is + marginal (~0.3 ns/voxel), not the ~3 ns/voxel originally implied. +- **The cheap architectural win is a branchless + `LeafNode::getValue(offset)` in NanoVDB** — ~15 lines + that would speed up every stencil gather caller (Legacy, hybrid, + HaloStencilAccessor, future variants) by ~3×. +- **HaloStencilAccessor's value proposition is validated**: its precomputed + uint64 index buffer naturally eliminates `isOn` branches by never evaluating + them. The speedup over branchless-leaf is smaller than previously + framed (~0.5–1 ns/voxel rather than sub-2 ns/voxel territory), but + still real. Worth building for the absolute-perf cases. + +See `BatchAccessor.md` §8j for the full measurement matrix, methodology, +correction log relative to §8g/§8h/§8i, and the branchless-experiment +source. + --- -## 9. `getValue()` — tap access by coordinate +## 9. `tapIndex()` — compile-time slot lookup, `mIndices[][]` access + +> **API evolution.** Earlier drafts of this document described a +> `getValue() const → const IndexVec&` member and an +> `operator[](int) → const IndexVec&` accessor. Both were removed in the +> hybrid refactor (§8.1). The results buffer is now a plain public 2D +> C array; callers pick their own access pattern. The change aligns with +> the hybrid's Simd-free public API — no `Simd<>` or `SimdMask<>` type +> appears in the class's public interface. ```cpp +// Storage — public, part of the ABI: +alignas(64) uint64_t mIndices[SIZE][W]; + +// Compile-time slot lookup (reorder-safe, zero runtime cost): template -const IndexVec& getValue() const { - constexpr int I = findIndex( +static constexpr int tapIndex() { + constexpr int I = detail::findIndex( std::make_index_sequence{}); - static_assert(I >= 0, "StencilAccessor::getValue: tap not in stencil"); - return mIndices[I]; + static_assert(I >= 0, "StencilAccessor::tapIndex: tap not in stencil"); + return I; } -``` - -**Inverse map** (`findIndex`): a `constexpr` fold over all `SIZE` taps, comparing -`(di,dj,dk)` against each `StencilPoint`. O(N) compile-time evaluations — -negligible for realistic stencil sizes. Resolved entirely at compile time; the -resulting `I` is a compile-time constant used as an array index. -**`static_assert`**: catches invalid tap coordinates at compile time with a clear -message. Same safety guarantee as OpenVDB stencil's bounds check. +// Iteration bound: +static constexpr int size() { return SIZE; } +``` -**Lifetime**: the returned reference is valid only until the next `moveTo` call. -The caller must not cache the reference across batches. 
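+
+A minimal caller-side illustration (the accessor instance `acc`, the lane
+index `lane`, and the alias `AccT` for the caller's concrete
+`StencilAccessor` instantiation are all assumptions here, not part of the
+class itself):
+
+```cpp
+constexpr int slot = AccT::tapIndex<-3, 0, 0>();  // integer constant, reorder-safe
+static_assert(slot >= 0 && slot < AccT::size());  // invalid taps fail to compile
+const uint64_t idx = acc.mIndices[slot][lane];    // row = tap, column = SIMD lane
+```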
+**Inverse map** (`detail::findIndex`): a `constexpr` fold over all `SIZE`
+taps, comparing `(DI,DJ,DK)` against each `StencilPoint`. O(N) compile-time
+evaluations — negligible for realistic stencil sizes. Resolved entirely at
+compile time; `tapIndex<-3,0,0>()` compiles to an integer literal.
 
-**Indexed access** — for kernels that iterate over all taps generically:
+**`static_assert`**: catches invalid tap coordinates at compile time with a
+clear message. Same safety guarantee as OpenVDB stencil's bounds check.
 
-```cpp
-const IndexVec& operator[](int i) const { return mIndices[i]; }
-```
+**Lifetime**: `mIndices` is valid only until the next `moveTo` call. The
+caller must not cache references across batches.
 
-Public, no bounds check in release. Same lifetime caveat as `getValue`.
+**Why expose `mIndices` directly** (rather than a method that returns it):
+the results buffer is plain data — no lazy work, no layout translation, no
+invariants to enforce. Hiding it behind an accessor would pretend
+otherwise. Direct access also lets callers choose their SIMD load pattern
+(or scalar iteration) without our API imposing one.
 
---
 
## 10. Caller-side usage pattern
 
```cpp
-// Construct once per VBM block
-StencilAccessor stencil(grid, vbm.firstLeafID(blockID), nExtraLeaves);
-
-for (int b = 0; b < nBatches; ++b) {
-    auto active = stencil.moveTo(leafIndex + b*W, voxelOffset + b*W);
-    if (util::none_of(active)) continue;
-
-    // Access by coordinate — compile-time slot resolution
-    auto idx_m3 = stencil.getValue<-3, 0, 0>();  // Simd
-    auto idx_m2 = stencil.getValue<-2, 0, 0>();
-    // ... feed into WENO kernel alongside sidecar value fetches
+// Construct once per VBM block.
+StencilAccessor stencil(
+    grid, vbm.firstLeafID(blockID), nExtraLeaves);
+using stencilAccT = decltype(stencil);  // concrete accessor type, for static queries
+
+// Active-lane information comes from decodeInverseMaps's UnusedLeafIndex
+// sentinel — the same source that StencilAccessor uses internally.
+for (int bs = 0; bs < BlockWidth; bs += W) {
+    stencil.moveTo(leafIndex + bs, voxelOffset + bs);  // returns void
+
+    // Option A: scalar iteration across lanes and taps.
+    for (int i = 0; i < W; ++i) {
+        if (leafIndex[bs + i] == UnusedLeafIndex) continue;
+        for (int k = 0; k < stencilAccT::size(); ++k) {
+            consume(stencil.mIndices[k][i]);  // uint64_t
+        }
+    }
+
+    // Option B: SIMD load of an entire tap row (caller picks backend/width).
+    auto row_m3 = util::Simd<uint64_t, W>(
+        stencil.mIndices[stencilAccT::tapIndex<-3, 0, 0>()],
+        util::element_aligned);
+
+    // Option C: compile-time named tap access for a handful of taps.
+    const uint64_t& xm3 = stencil.mIndices[stencilAccT::tapIndex<-3,0,0>()][i];
 }
 // stencil destroyed here (end of block scope)
 ```
 
+No `Simd<>` or `SimdMask<>` types appear in the public API. The caller
+uses its own SIMD backend (or none) to consume `mIndices`.
+
 ---
 
 ## 11. Ownership summary
 
@@ -506,8 +628,8 @@ for (int b = 0; b < nBatches; ++b) {
 | `leafMask` computation | `StencilAccessor` (derived inside `moveTo`) |
 | Straddling loop | `StencilAccessor` |
 | Hull prefetch sequencing | `StencilAccessor` |
-| Tap fold and `where`-blend | `StencilAccessor` |
-| `mIndices[SIZE]` storage and zeroing | `StencilAccessor` (zeroed at top of each `moveTo`) |
+| Tap fold (writes directly into `mIndices[Is]`) | `StencilAccessor::calcTaps` |
+| `mIndices[SIZE][W]` storage and zeroing | `StencilAccessor` (public member; `std::memset` at top of each `moveTo`) |
 | `nExtraLeaves` debug bound | `StencilAccessor` (`#ifndef NDEBUG` member; removable) |
 | Center-leaf lifetime (block scope) | Caller |
 
@@ -515,28 +637,42 @@
 
 ## 12. Design decisions (all resolved)
 
-1. **`moveTo` return type — `IndexMaskVec` by value.**
-   The initial `activeMask = (leafSlice != UnusedLeafIndex)` is saved before the
-   straddling loop drains it to zero, widened from `LeafMaskVec` (uint32_t) to
-   `IndexMaskVec` (uint64_t), and returned. This gives the caller a mask that is
-   semantically aligned with the uint64_t `mIndices` data. The returned mask has
-   two simultaneous readings: which lanes held valid voxels (not padding sentinels),
-   and which lanes of `mIndices[k]` contain valid stencil indices. These are the
-   same predicate. No member copy is kept — the mask is consumed at the call site.
+> **Evolution.** Decisions 1 and 3 below have been superseded by the
+> hybrid refactor (§8.1): `moveTo` now returns `void`, and `operator[]` /
+> `getValue<>()` were removed in favour of public `mIndices` access +
+> `tapIndex<>()`. The original rationales are preserved for historical
+> context; the current API is §9's.
+
+1. **`moveTo` return type — ~~`IndexMaskVec` by value~~ `void` (revised §8.1).**
+   *Original rationale:* The initial
+   `activeMask = (leafSlice != UnusedLeafIndex)` was saved before the
+   straddling loop drains it to zero, widened from `LeafMaskVec` (uint32_t)
+   to `IndexMaskVec` (uint64_t), and returned. This gave the caller a mask
+   semantically aligned with the uint64_t `mIndices` data.
+   *Revised:* `moveTo` now returns `void`. The active-lane information is
+   redundant: callers already have `leafIndex[]` from `decodeInverseMaps`
+   and the same `UnusedLeafIndex` sentinel that `StencilAccessor` uses
+   internally. Returning the mask duplicated state and forced a
+   heterogeneous `SimdMask<uint32_t>` → `SimdMask<uint64_t>` widening with
+   a boolean round-trip (§8h) — all for zero information gain. Removing
+   it also eliminated the last `SimdMask<>` type from the public API.
 
 2. **Inactive-lane `mIndices` values — zeroed at top of `moveTo`.**
-   `mIndices[0..SIZE-1]` is set to `IndexVec(0)` at the start of every `moveTo`
-   call. Index 0 is the NanoVDB IndexGrid "not found / background" sentinel, so
-   inactive lanes yield a well-defined background index rather than stale data.
-   The cost is `SIZE` × W zero-writes per call (~36 YMM stores for WENO5 W=16),
-   which is negligible.
-
-3. **`operator[]` — public, const-ref, no bounds check.**
-   ```cpp
-   const IndexVec& operator[](int i) const { return mIndices[i]; }
-   ```
-   For kernels that iterate over all taps generically. Same lifetime as
-   `getValue`: valid only until the next `moveTo` call.
+   `mIndices` is set to zero (via `std::memset`) at the start of every
+   `moveTo` call. Index 0 is the NanoVDB IndexGrid "not found / background"
+   sentinel, so inactive lanes yield a well-defined background index rather
+   than stale data. The cost is a single `memset` of `SIZE * W * 8` bytes
+   per call (2304 B for WENO5 W=16), which stays in L1 and pipelines under
+   other work.
+
+3. **~~`operator[]` — public, const-ref, no bounds check~~ removed (revised §9).**
+   *Original:* `const IndexVec& operator[](int i) const { return mIndices[i]; }`
+   for kernels that iterate over all taps generically.
+   *Revised:* `mIndices` is now a public member (§9); direct indexing
+   replaces both `operator[](int)` and `getValue<>()`. Named-tap
+   access is via the `tapIndex<>()` static constexpr slot lookup.
+   This change is consistent with the hybrid's Simd-free public API — no
+   method can now return a `Simd<>` or `SimdMask<>` reference.
 
 4. **`StencilT` representation — `std::tuple<StencilPoint<...>...>` for both
    `Taps` and `Hull`.**

From 8a24ddfdf330239323080ac4c203ad40e6cb5455 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sun, 19 Apr 2026 14:38:31 -0500
Subject: [PATCH 36/60] NanoVDB: add LeafData::getValueBranchless + use
 leaf-only ReadAccessor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a branchless sibling to LeafData::getValue, and switches the CPU
stencil-gather test scaffolding to use a leaf-only ReadAccessor variant.

Motivated by the PMU-counter investigation in BatchAccessor.md §8j: the
single data-dependent branch inside LeafData::getValue --

    if (!(word & bit)) return 0;

-- is the dominant cost for hot stencil loops over OnIndex trees, driving
branch-mispredict rates to 8-10% on random-access workloads (IPC ~2) and
still to ~1.7% on spatially-coherent narrow-band workloads (IPC ~4.2 vs a
potential ~5.5).

NanoVDB.h: LeafData::getValueBranchless(uint32_t i)

    Same semantics as getValue -- returns 0 for inactive voxels, otherwise
    mOffset + prefix9(n) + popcount(w & (bit - 1u)) -- but replaces the
    early-return guard with an arithmetic mask gate:

        uint64_t mask = (w & bit) ? ~0ull : 0ull;
        return mask & sum;

    The ternary over two constant arms compiles to test + cmov on x86, not
    a conditional jump. Verified on GCC 13 at -O3 -march=native.

    Semantics are bit-for-bit identical to getValue (checksums match across
    both synthetic random-50% and real narrow-band workloads including OFF
    voxels).

    Scoped to LeafData (not LeafNode) as an opt-in expert path for
    neighbourhood-aware hot loops (BatchAccessor-class cachers,
    HaloStencilAccessor-style precomputation). The generic LeafNode and
    ReadAccessor APIs are unchanged.

LegacyStencilAccessor: use ReadAccessor<BuildT, 0>

    For GetValue workloads the upper/lower cache slots of
    DefaultReadAccessor are never consulted (OpT::LEVEL == 0 falls straight
    to mRoot on a leaf cache miss; NanoVDB.h:5387) and are only passively
    written during the walk. Switching to a 1-level leaf-only cache removes
    that passive bookkeeping.

    Measured effect: Legacy path sees no change (its pipeline is already
    stalled on branch mispredicts, extra stores overlap for free). The
    branchless path consistently improves by 4-9% -- when IPC is near peak,
    every retired instruction counts.

ex_stencil_gather_cpu updates

    - legacy-branchless pass now calls leaf->data()->getValueBranchless
      instead of the hand-inlined formula it used during investigation.
      The checksum now matches LegacyStencilAccessor exactly (the hand-
      inlined version wrote non-zero garbage for OFF voxels); the new
      method is semantically a drop-in replacement.

    - center-hit and legacy-branchless passes both use the leaf-only
      ReadAccessor construction.
Measured speedup (32 M ambient / 50% random occupancy / i9-285K, 24 cores wall clock): Legacy: 94.5 ms Legacy branchless: 34.3 ms (2.8x) Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/NanoVDB.h | 29 +++++++++++++++++++ .../stencil_gather_cpu.cpp | 29 +++++++++---------- nanovdb/nanovdb/util/LegacyStencilAccessor.h | 14 +++++++-- 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/nanovdb/nanovdb/NanoVDB.h b/nanovdb/nanovdb/NanoVDB.h index 78955cd251..e8a179d702 100644 --- a/nanovdb/nanovdb/NanoVDB.h +++ b/nanovdb/nanovdb/NanoVDB.h @@ -4146,6 +4146,35 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData> (9u * n) & 511u; return sum; } + + /// @brief Branchless variant of getValue, intended for neighbourhood-aware + /// caching paths (e.g. nanovdb::BatchAccessor's scalar tail) where the + /// per-tap mValueMask.isOn(offset) test is data-dependent and unpredictable. + /// + /// Semantics are identical to getValue: returns 0 for inactive voxels, + /// otherwise returns mOffset + prefix9(n) + popcount(w & (bit - 1u)). + /// Implementation uses a ternary-constant gate that the compiler emits as + /// `test + cmov` on x86, avoiding the branch-mispredict storm observed on + /// workloads with low spatial coherence between tap positions and the + /// source mValueMask. See BatchAccessor.md §8j for the perf-counter + /// investigation that motivated this variant. + __hostdev__ uint64_t getValueBranchless(uint32_t i) const + { + const uint32_t n = i >> 6; + const uint64_t w = BaseT::mValueMask.words()[n]; + const uint64_t bit = uint64_t(1) << (i & 63u); + // prefix9 extraction: predictable branch (n=0 only for the first + // x-slice; compilers emit as cmov). Kept as a ternary rather than + // unconditional because the prefix-sum shift would be UB at n=0. + const uint64_t prefix = n == 0u ? uint64_t(0) + : (BaseT::mPrefixSum >> (9u * (n - 1u))) & 511u; + const uint64_t sum = BaseT::mOffset + prefix + util::countOn(w & (bit - 1u)); + // 0 for inactive voxels, all-ones mask for active. The ternary on + // two compile-time-constant arms compiles to test + cmov, not to a + // conditional jump -- the whole point of this method. + const uint64_t mask = (w & bit) ? ~uint64_t(0) : uint64_t(0); + return mask & sum; + } }; // LeafData // --------------------------> LeafData <------------------------------------ diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index e8911a63c6..dc6e35e033 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -559,7 +559,10 @@ static void runPerf( [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; alignas(64) uint16_t voxelOffset[BlockWidth]; - auto acc = grid->getAccessor(); + // Leaf-only cache (levels 1/2 would never be consulted for + // GetValue — see NanoVDB.h:5387 — and would only pay passive + // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. 
+            nanovdb::ReadAccessor<BuildT, 0> acc(grid->tree().root());
             uint64_t* bs0 = sums.data();
 
             for (size_t bID = range.begin(); bID != range.end(); ++bID) {
@@ -658,7 +661,10 @@ static void runPerf(
         [&](const nanovdb::util::Range1D& range) {
             alignas(64) uint32_t leafIndex[BlockWidth];
             alignas(64) uint16_t voxelOffset[BlockWidth];
-            auto acc = grid->getAccessor();
+            // Leaf-only cache (levels 1/2 would never be consulted for
+            // GetValue — see NanoVDB.h:5387 — and would only pay passive
+            // bookkeeping on miss). See LegacyStencilAccessor.h for rationale.
+            nanovdb::ReadAccessor<BuildT, 0> acc(grid->tree().root());
             uint64_t* bs0 = sums.data();
 
             for (size_t bID = range.begin(); bID != range.end(); ++bID) {
@@ -682,22 +688,15 @@ static void runPerf(
                 auto addTap = [&](int di, int dj, int dk) {
                     const nanovdb::Coord c = center + nanovdb::Coord(di, dj, dk);
                     const LeafT* leaf = acc.probeLeaf(c);
-                    if (!leaf) return; // tap outside narrow band (still branches,
-                                       // but well-predicted for active-region voxels)
+                    if (!leaf) return; // tap outside narrow band (predictable branch
+                                       // for active-region voxels)
                     const uint32_t offset = (uint32_t(c[0] & 7) << 6)
                                           | (uint32_t(c[1] & 7) << 3)
                                           |  uint32_t(c[2] & 7);
-                    const uint32_t wordIdx = offset >> 6;
-                    const uint64_t bit = uint64_t(1) << (offset & 63);
-                    const uint64_t maskWord = leaf->valueMask().words()[wordIdx];
-                    // prefix9 extract — cmov'd by the compiler, not a branch-miss source
-                    const uint64_t prefix = (wordIdx > 0)
-                        ? (leaf->data()->mPrefixSum >> (9u * (wordIdx - 1u))) & 511u
-                        : uint64_t(0);
-                    // UNCONDITIONAL: no isOn test. For OFF voxels this computes a
-                    // non-zero value but does no branch.
-                    s += leaf->data()->mOffset + prefix
-                       + __builtin_popcountll(maskWord & (bit - 1));
+                    // NanoVDB LeafData::getValueBranchless --
+                    // same formula as getValue but with the isOn check
+                    // replaced by a cmov-style mask gate.
+                    s += leaf->data()->getValueBranchless(offset);
                 };
 
                 // Unroll all 18 WENO5 taps via the compile-time tuple.
diff --git a/nanovdb/nanovdb/util/LegacyStencilAccessor.h b/nanovdb/nanovdb/util/LegacyStencilAccessor.h
index 6fe63df257..a53c29ef5c 100644
--- a/nanovdb/nanovdb/util/LegacyStencilAccessor.h
+++ b/nanovdb/nanovdb/util/LegacyStencilAccessor.h
@@ -56,8 +56,16 @@ class LegacyStencilAccessor
     static constexpr int SIZE = int(std::tuple_size_v<StencilT>);
 
 public:
+    // Leaf-only ReadAccessor (cache level 0 only). The DefaultReadAccessor
+    // (levels 0/1/2) caches upper and lower nodes too, but those slots are
+    // never consulted during a GetValue cache-miss resolution -- the fallback
+    // goes straight to mRoot->getAndCache(...). Using a 1-level accessor
+    // removes passive bookkeeping of the upper/lower slots on every miss and
+    // keeps the benchmark honest about what's being measured.
+    using AccessorT = ReadAccessor<BuildT, 0>;
+
     explicit LegacyStencilAccessor(const GridT& grid)
-        : mAcc(grid.getAccessor()) {}
+        : mAcc(grid.tree().root()) {}
 
     // -------------------------------------------------------------------------
     // moveTo -- resolve all SIZE tap indices for the voxel at @a center.
@@ -107,8 +115,8 @@ class LegacyStencilAccessor
                 std::tuple_element_t<Is, StencilT>::dk)))), ...);
     }
 
-    DefaultReadAccessor<BuildT> mAcc;
-    uint64_t mStencil[SIZE];
+    AccessorT mAcc;
+    uint64_t  mStencil[SIZE];
 };
 
 } // namespace nanovdb

From d06b0946a1339ea1dfee5390ab61aaf0c9153b99 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Sun, 19 Apr 2026 14:39:01 -0500
Subject: [PATCH 37/60] ex_narrowband_stencil_cpu: CPU stencil-gather benchmark
 on real .vdb input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Counterpart to ex_stencil_gather_cpu, which uses a procedurally generated
random-occupancy domain (a pathological worst case for branch prediction
inside LeafData::getValue). This example loads an openvdb level-set
FloatGrid from disk, converts it to a NanoVDB ValueOnIndex topology grid
plus a separately-allocated float sidecar, and runs the same perf-
decomposition battery on a workload with realistic spatial coherence.

Purpose: quantify how much of the isOn-branch-mispredict cost (see
BatchAccessor.md §8j) survives on real narrow-band traversals where
consecutive tap queries mostly land on active voxels. Findings to be
documented in the follow-on commit.

Pipeline:
    openvdb::io::File(path)            -- disk load
    -> openvdb::FloatGrid              -- first FloatGrid, or --grid=<name>
    -> nanovdb::tools::CreateNanoGrid  -- builder
         .getHandle(channels=0)        -- topology only
         .copyValues(sidecar.data())   -- float sidecar
    -> validateSidecarOrdering()       -- 1000-voxel sanity check
    -> runPrototype + runPerf          (identical to ex_stencil_gather_cpu)

The separated-sidecar path avoids the double conversion that would
otherwise be needed to keep blind data out of the grid. Sidecar ordering is
verified at startup by comparing floatGrid.getValue(ijk) against
sidecar[indexGrid.tree().getValue(ijk)] on 1000 random active voxels
(opt-out via --skip-validation). The sidecar is plumbed through but not
yet consumed by any stencil path -- it exists for future "fetch values via
the sidecar" work.

CLI:
    narrowband_stencil_cpu <in.vdb> [--grid=<name>]
                           [--pass=<pass>]      // same set as
                                                // ex_stencil_gather_cpu
                           [--threads=<N>]
                           [--skip-validation]

Both the legacy-branchless pass (calls LeafData::getValueBranchless,
introduced in the prior commit) and the center-hit / legacy-branchless
accessor construction (leaf-only ReadAccessor) match the updated
ex_stencil_gather_cpu conventions.

Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/examples/CMakeLists.txt       |  10 +
 .../narrowband_stencil_cpu.cpp                | 969 ++++++++++++++++++
 2 files changed, 979 insertions(+)
 create mode 100644 nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp

diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt
index 5c35199e91..37b6167d20 100644
--- a/nanovdb/nanovdb/examples/CMakeLists.txt
+++ b/nanovdb/nanovdb/examples/CMakeLists.txt
@@ -131,6 +131,16 @@ if(TARGET ex_stencil_gather_cpu)
     ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 endif()
 
+# CPU-only stencil gather benchmark on a real narrow-band level-set .vdb
+# (see BatchAccessor.md §8j for the isOn-branch investigation that motivates
+# comparing random-occupancy vs realistic spatial-coherence workloads).
+nanovdb_example(NAME "ex_narrowband_stencil_cpu" OPENVDB)
+if(TARGET ex_narrowband_stencil_cpu)
+  target_compile_options(ex_narrowband_stencil_cpu PRIVATE -march=native -fopenmp-simd)
+  target_include_directories(ex_narrowband_stencil_cpu PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+endif() + if(CUDAToolkit_FOUND) nanovdb_example(NAME "ex_make_mgpu_nanovdb") # requires cuRAND target_link_libraries(ex_make_mgpu_nanovdb PRIVATE CUDA::curand) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp new file mode 100644 index 0000000000..5f5dd0aed3 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -0,0 +1,969 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file narrowband_stencil_cpu.cpp + + \brief CPU stencil gather on a real narrow-band level set loaded from .vdb. + + Counterpart to ex_stencil_gather_cpu, which uses a procedurally generated + random-occupancy domain. This example instead loads an openvdb level-set + FloatGrid from disk, converts it to a NanoVDB ValueOnIndex topology grid, + and harvests the source float values into a separately-allocated sidecar + buffer. Purpose: exercise the same perf-decomposition battery on a + workload with realistic spatial coherence — narrow-band taps are mostly + close to the surface, so the valueMask.isOn(offset) branch may be more + predictable than in the random-occupancy case (see BatchAccessor.md §8j). + + Pipeline: + openvdb::io::File(path) -- disk load + -> openvdb::GridBase::Ptr -- untyped handle + -> openvdb::FloatGrid -- typed, narrow-band + -> nanovdb::tools::CreateNanoGrid -- builder + .getHandle(channels=0) -- topology only + .copyValues(sidecar.data()) -- float sidecar + -> VBM + runPrototype + runPerf (identical to ex_stencil_gather_cpu) + + The sidecar is captured but not yet consumed by any stencil path -- plumbing + only, for future "fetch values via the sidecar" work. A one-time + validation check at startup compares FloatGrid.getValue(ijk) against + sidecar[indexGrid.tree().getValue(ijk)] on ~1000 random active voxels. + + Build: + Configured via CMakeLists.txt in the parent examples/ directory. + Requires OpenVDB (for .vdb IO). No CUDA. + + Usage: + narrowband_stencil_cpu [--grid=] + [--pass=] [--threads=] + [--skip-validation] +*/ + +#include +#include +#include // CreateNanoGrid builder, openToIndexVDB +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include // std::unique_ptr +#include +#include // std::accumulate (checksum) +#include +#include + +// ============================================================ +// Constants and type aliases +// ============================================================ + +static constexpr int Log2BlockWidth = 7; +static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128 +static constexpr int SIMDw = 16; // StencilAccessor batch width + +using BuildT = nanovdb::ValueOnIndex; +using GridT = nanovdb::NanoGrid; +using LeafT = nanovdb::NanoLeaf; +using CPUVBM = nanovdb::tools::VoxelBlockManager; + +using SAccT = nanovdb::StencilAccessor; +using LegacyAccT = nanovdb::LegacyStencilAccessor; + +// Decomposition-only stencil: 18 taps all at (0,0,0). Measures the hybrid +// StencilAccessor's floor cost when no tap crosses a leaf boundary and every +// lookup hits the center leaf. Subtracting this from the Weno5 run isolates +// the cross-leaf overhead — BUT the 18 identical compile-time taps give the +// compiler a large CSE opportunity, biasing the number downward. 
+struct DegenerateStencil { + using Taps = std::tuple< + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0> + >; + // Empty Hull: prefetchHull becomes a no-op; center leaf is always cached + // by BatchAccessor's constructor / advance(). + using Hull = std::tuple<>; +}; +using DegAccT = nanovdb::StencilAccessor; + +// CSE-resistant in-leaf stencil: 18 distinct compile-time taps spanning the +// leaf's 8^3 footprint (all axes, 6 tap offsets in [0..6] per axis). Used +// via StencilAccessor::moveToInLeaf, which applies (voxel_local + tap) mod 8 +// to the center voxel — guaranteeing every tap accesses the center leaf +// while touching distinct mValueMask words across taps and across voxels. +// This isolates the hybrid's single-leaf floor without the CSE bias that +// DegenerateStencil suffers from. +struct InLeafStencil { + using Taps = std::tuple< + // x spans 0..6 (hits mValueMask words 0..6 depending on voxel's local x) + nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<1,0,0>, + nanovdb::StencilPoint<2,0,0>, nanovdb::StencilPoint<3,0,0>, + nanovdb::StencilPoint<4,0,0>, nanovdb::StencilPoint<5,0,0>, + // y spans 1..6 (different destY positions within a word) + nanovdb::StencilPoint<0,1,0>, nanovdb::StencilPoint<0,2,0>, + nanovdb::StencilPoint<0,3,0>, nanovdb::StencilPoint<0,4,0>, + nanovdb::StencilPoint<0,5,0>, nanovdb::StencilPoint<0,6,0>, + // z spans 1..6 + nanovdb::StencilPoint<0,0,1>, nanovdb::StencilPoint<0,0,2>, + nanovdb::StencilPoint<0,0,3>, nanovdb::StencilPoint<0,0,4>, + nanovdb::StencilPoint<0,0,5>, nanovdb::StencilPoint<0,0,6> + >; + using Hull = std::tuple<>; // moveToInLeaf skips prefetchHull entirely +}; +using InLeafAccT = nanovdb::StencilAccessor; + +// ============================================================ +// VDB file loading + sidecar harvest +// ============================================================ + +/// Picks the first openvdb::FloatGrid from the file (optionally by name). +/// Throws on any failure (file not found, no FloatGrid, etc.). +static openvdb::FloatGrid::Ptr +loadFloatGridFromVdb(const std::string& path, const std::string& gridName) +{ + openvdb::io::File file(path); + file.open(false); // delayed loading off + + openvdb::GridBase::Ptr base; + if (!gridName.empty()) { + if (!file.hasGrid(gridName)) + throw std::runtime_error( + "no grid named \"" + gridName + "\" in " + path); + base = file.readGrid(gridName); + } else { + // First FloatGrid wins. 
+        openvdb::GridPtrVecPtr grids = file.getGrids();
+        for (auto& g : *grids) {
+            if (g && g->isType<openvdb::FloatGrid>()) {
+                base = g;  // already fully loaded by getGrids()
+                break;
+            }
+        }
+        if (!base)
+            throw std::runtime_error("no openvdb::FloatGrid found in " + path);
+    }
+    file.close();
+
+    auto floatGrid = openvdb::gridPtrCast<openvdb::FloatGrid>(base);
+    if (!floatGrid)
+        throw std::runtime_error("grid is not an openvdb::FloatGrid");
+    return floatGrid;
+}
+
+/// Convert an openvdb::FloatGrid into a NanoVDB ValueOnIndex topology grid
+/// plus a separately-allocated std::vector<float> sidecar, using the
+/// CreateNanoGrid builder path (channels=0, no blind data in grid).
+///
+/// The builder's internal mValIdx is populated by getHandle(), so the
+/// subsequent copyValues() writes the FloatGrid's active voxel values into
+/// the sidecar in the same order that leaf.getValue(offset) returns.
+struct NarrowBandPayload {
+    nanovdb::GridHandle<nanovdb::HostBuffer> handle;
+    std::vector<float> sidecar;
+};
+
+static NarrowBandPayload
+convertToIndexGridWithSidecar(openvdb::FloatGrid& floatGrid)
+{
+    nanovdb::tools::CreateNanoGrid builder(floatGrid);
+
+    NarrowBandPayload p;
+    p.handle = builder.template getHandle<
+        nanovdb::ValueOnIndex, nanovdb::HostBuffer>(
+            /*channels =*/ 0u,  // no blind data
+            /*incStats =*/ false,
+            /*incTiles =*/ false);
+
+    // valueCount() is only valid after getHandle with an index DstBuildT.
+    p.sidecar.resize(builder.valueCount());
+    builder.template copyValues<float>(p.sidecar.data());
+    return p;
+}
+
+/// One-time consistency check between the source FloatGrid and the
+/// IndexGrid + sidecar pair. Samples N active voxels from the source,
+/// verifies: floatGrid.getValue(ijk) == sidecar[indexGrid.tree().getValue(ijk)].
+/// Returns number of mismatches (0 == pass).
+static uint64_t
+validateSidecarOrdering(const openvdb::FloatGrid& floatGrid,
+                        const nanovdb::NanoGrid<BuildT>& indexGrid,
+                        const std::vector<float>& sidecar,
+                        size_t maxSamples = 1000)
+{
+    // Walk the source grid's active voxels; sample up to maxSamples of them.
+    const auto totalActive = floatGrid.activeVoxelCount();
+    if (totalActive == 0) return 0;
+
+    const size_t step = std::max<size_t>(1, size_t(totalActive / maxSamples));
+    auto indexAcc = indexGrid.getAccessor();
+
+    uint64_t checked = 0, mismatches = 0, firstReports = 0;
+    size_t strideCounter = 0;
+
+    for (auto it = floatGrid.cbeginValueOn(); it; ++it) {
+        if ((strideCounter++ % step) != 0) continue;
+
+        const openvdb::Coord& oc = it.getCoord();
+        const nanovdb::Coord nc(oc.x(), oc.y(), oc.z());
+
+        const uint64_t idx = indexAcc.getValue(nc);
+        if (idx == 0 || idx >= sidecar.size()) {
+            ++mismatches;
+            if (firstReports++ < 5)
+                std::cerr << "  sidecar OOB at (" << oc.x() << "," << oc.y()
+                          << "," << oc.z() << "): idx=" << idx
+                          << " sidecar.size=" << sidecar.size() << "\n";
+            continue;
+        }
+        const float expected = it.getValue();
+        const float actual   = sidecar[idx];
+        if (expected != actual) {
+            ++mismatches;
+            if (firstReports++ < 5)
+                std::cerr << "  sidecar MISMATCH at (" << oc.x() << "," << oc.y()
+                          << "," << oc.z() << "): idx=" << idx
+                          << " expected=" << expected
+                          << " actual=" << actual << "\n";
+        }
+        ++checked;
+        if (checked >= maxSamples) break;
+    }
+
+    std::cout << "Sidecar validation: checked=" << checked
+              << " mismatches=" << mismatches
+              << (mismatches == 0 ?
" PASSED\n" : " FAILED\n"); + return mismatches; +} + +// ============================================================ +// Verification +// ============================================================ + +struct VerifyStats { + uint64_t laneChecks = 0; + uint64_t errors = 0; +}; + +/// Cross-validate one StencilAccessor batch against LegacyStencilAccessor. +/// +/// Active lanes (leafIndex[p] != UnusedLeafIndex): reconstruct the global +/// coordinate from (leafIndex, voxelOffset), call legacyAcc.moveTo(), and +/// compare all SIZE tap indices element-by-element. +/// +/// Inactive lanes: assert all tap slots in stencilAcc hold 0 (background index). +static void verifyStencilAccessor( + const SAccT& stencilAcc, + const uint32_t* leafIndex, + const uint16_t* voxelOffset, + int batchStart, + const LeafT* firstLeaf, + LegacyAccT& legacyAcc, + VerifyStats& stats) +{ + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + const uint32_t li = leafIndex[p]; + + if (li == CPUVBM::UnusedLeafIndex) { + // Inactive lane: all tap slots must hold 0 (NanoVDB background index). + for (int k = 0; k < stencilAcc.size(); ++k) { + ++stats.laneChecks; + const uint64_t got = stencilAcc.mIndices[k][i]; + if (got != 0) { + ++stats.errors; + if (stats.errors <= 10) + std::cerr << "STENCIL inactive lane=" << i + << " tap=" << k + << ": expected 0, got " << got << "\n"; + } + } + continue; + } + + // Active lane: compare against the LegacyStencilAccessor oracle. + const uint16_t vo = voxelOffset[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + + legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); + + for (int k = 0; k < stencilAcc.size(); ++k) { + ++stats.laneChecks; + const uint64_t expected = legacyAcc[k]; + const uint64_t actual = stencilAcc.mIndices[k][i]; + if (actual != expected) { + ++stats.errors; + if (stats.errors <= 10) + std::cerr << "STENCIL MISMATCH" + << " tap=" << k + << " lane=" << i + << " expected=" << expected + << " actual=" << actual << "\n"; + } + } + } +} + +// ============================================================ +// Correctness run: cross-validate StencilAccessor vs LegacyStencilAccessor +// ============================================================ + +static void runPrototype( + const GridT* grid, + const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) +{ + const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); + const uint64_t nVoxels = grid->activeVoxelCount(); + const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); + + const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); + const uint64_t* jumpMap = vbmHandle.hostJumpMap(); + + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + // LegacyStencilAccessor owns its ReadAccessor; one instance per thread. 
+    LegacyAccT legacyAcc(*grid);
+    VerifyStats stats;
+
+    for (uint32_t bID = 0; bID < nBlocks; ++bID) {
+        const uint64_t blockFirstOffset =
+            vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth;
+
+        CPUVBM::decodeInverseMaps(
+            grid, firstLeafID[bID],
+            &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength],
+            blockFirstOffset, leafIndex, voxelOffset);
+
+        int nExtraLeaves = 0;
+        for (int w = 0; w < CPUVBM::JumpMapLength; ++w)
+            nExtraLeaves += nanovdb::util::countOn(
+                jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]);
+
+        SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves);
+
+        for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
+            stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
+            verifyStencilAccessor(stencilAcc,
+                                  leafIndex, voxelOffset, batchStart,
+                                  firstLeaf, legacyAcc, stats);
+        }
+    }
+
+    std::cout << "Correctness (StencilAccessor vs LegacyStencilAccessor):\n"
+              << "  blocks     = " << nBlocks << "\n"
+              << "  voxels     = " << nVoxels << "\n"
+              << "  laneChecks = " << stats.laneChecks << "\n";
+
+    if (stats.errors == 0)
+        std::cout << "  PASSED\n";
+    else
+        std::cerr << "  FAILED: " << stats.errors << " mismatches\n";
+}
+
+// ============================================================
+// End-to-end performance comparison (multithreaded)
+//
+// Both paths run the full pipeline inside util::forEach:
+//     decodeInverseMaps → coord extraction → stencil gather → sum → store
+//
+// decodeInverseMaps is deliberately included: its cost is identical for
+// both paths (pure cancellation in the comparison) and including it avoids
+// fine-grained intra-block timing artifacts.
+//
+// Anti-DCE artifact: for each active voxel, accumulate the sum of all 18
+// tap uint64_t indices and write to sums[bID * BlockWidth + i]. The final
+// XOR checksum is printed, forcing the compiler to materialise the stores.
+//
+// Timing: nanovdb::util::Timer (steady_clock) around each forEach.
+// warm pass discards its measurement; only the second pass is reported.
+//
+// Denominator: grid->activeVoxelCount() — same for both paths.
+// ============================================================
+
+static void runPerf(
+    const GridT* grid,
+    const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle,
+    const std::string& passFilter = "all")
+{
+    // wantPass() returns true if this pass should run under the current filter.
+    // Supported names: "decode", "stencil", "degenerate", "inleaf", "framing",
+    // "center-hit", "legacy", "legacy-branchless". "all" runs everything.
+    auto wantPass = [&](const char* name) {
+        return passFilter == "all" || passFilter == name;
+    };
+
+    const LeafT* firstLeaf      = grid->tree().getFirstNode<0>();
+    const uint64_t nVoxels      = grid->activeVoxelCount();
+    const uint32_t nBlocks      = (uint32_t)vbmHandle.blockCount();
+    const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID();
+    const uint64_t* jumpMap     = vbmHandle.hostJumpMap();
+    const uint64_t firstOffset  = vbmHandle.firstOffset();
+
+    // Anti-DCE output array. Each thread writes its own non-overlapping
+    // range (bID * BlockWidth ... + BlockWidth - 1) — no synchronisation needed.
+    std::vector<uint64_t> sums((size_t)nBlocks * BlockWidth, 0);
+
+    std::ostringstream sink; // absorbs Timer's warm-pass "..." output
+    nanovdb::util::Timer timer;
+
+    auto timeForEach = [&](auto&& body) -> double {
+        // warm pass
+        timer.start("", sink);
+        body();
+        timer.elapsed();
+        // timed pass
+        timer.start("", sink);
+        body();
+        return static_cast<double>(timer.elapsed());
+    };
+
+    // ---- decodeInverseMaps-only baseline (both paths pay this cost) ----
+    // Anti-DCE: XOR one uint64_t per block derived from leafIndex[] + voxelOffset[]
+    // so the compiler can't elide the decode work.
+    double decodeUs = 0.0;
+    if (wantPass("decode")) decodeUs = timeForEach([&] {
+        nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
+            [&](const nanovdb::util::Range1D& range) {
+                alignas(64) uint32_t leafIndex[BlockWidth];
+                alignas(64) uint16_t voxelOffset[BlockWidth];
+
+                for (size_t bID = range.begin(); bID != range.end(); ++bID) {
+                    CPUVBM::decodeInverseMaps(
+                        grid, firstLeafID[bID],
+                        &jumpMap[bID * CPUVBM::JumpMapLength],
+                        firstOffset + bID * BlockWidth,
+                        leafIndex, voxelOffset);
+
+                    uint64_t acc = 0;
+                    for (int i = 0; i < BlockWidth; ++i)
+                        acc ^= (uint64_t(leafIndex[i]) << 16) | uint64_t(voxelOffset[i]);
+                    sums[bID * BlockWidth] = acc;  // one slot per block as anti-DCE
+                }
+            });
+    });
+
+    // ---- StencilAccessor ----
+    double stencilUs = 0.0;
+    uint64_t stencilChecksum = 0;
+    if (wantPass("stencil")) {
+        std::fill(sums.begin(), sums.end(), uint64_t(0));
+
+        stencilUs = timeForEach([&] {
+            nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
+                [&](const nanovdb::util::Range1D& range) {
+                    alignas(64) uint32_t leafIndex[BlockWidth];
+                    alignas(64) uint16_t voxelOffset[BlockWidth];
+
+                    for (size_t bID = range.begin(); bID != range.end(); ++bID) {
+                        CPUVBM::decodeInverseMaps(
+                            grid, firstLeafID[bID],
+                            &jumpMap[bID * CPUVBM::JumpMapLength],
+                            firstOffset + bID * BlockWidth,
+                            leafIndex, voxelOffset);
+
+                        int nExtraLeaves = 0;
+                        for (int w = 0; w < CPUVBM::JumpMapLength; ++w)
+                            nExtraLeaves += nanovdb::util::countOn(
+                                jumpMap[bID * CPUVBM::JumpMapLength + w]);
+
+                        SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves);
+                        uint64_t* bs = sums.data() + bID * BlockWidth;
+
+                        for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
+                            stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);
+                            for (int i = 0; i < SIMDw; ++i) {
+                                if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue;
+                                uint64_t s = 0;
+                                for (int k = 0; k < SAccT::size(); ++k)
+                                    s += stencilAcc.mIndices[k][i];
+                                bs[batchStart + i] = s;
+                            }
+                        }
+                    }
+                });
+        });
+
+        stencilChecksum =
+            std::accumulate(sums.begin(), sums.end(), uint64_t(0),
+                            [](uint64_t a, uint64_t b) { return a ^ b; });
+    } // end wantPass("stencil")
+
+    // ---- Hybrid floor: DegenerateStencil (18 taps all at (0,0,0)) ----
+    double degenerateUs = 0.0;
+    uint64_t degenerateChecksum = 0;
+    if (wantPass("degenerate")) {
+        std::fill(sums.begin(), sums.end(), uint64_t(0));
+        degenerateUs = timeForEach([&] {
+            nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
+                [&](const nanovdb::util::Range1D& range) {
+                    alignas(64) uint32_t leafIndex[BlockWidth];
+                    alignas(64) uint16_t voxelOffset[BlockWidth];
+
+                    for (size_t bID = range.begin(); bID != range.end(); ++bID) {
+                        CPUVBM::decodeInverseMaps(
+                            grid, firstLeafID[bID],
+                            &jumpMap[bID * CPUVBM::JumpMapLength],
+                            firstOffset + bID * BlockWidth,
+                            leafIndex, voxelOffset);
+
+                        int nExtraLeaves = 0;
+                        for (int w = 0; w < CPUVBM::JumpMapLength; ++w)
+                            nExtraLeaves += nanovdb::util::countOn(
+                                jumpMap[bID * CPUVBM::JumpMapLength + w]);
+
+                        DegAccT degAcc(*grid, firstLeafID[bID],
(uint32_t)nExtraLeaves); + uint64_t* bs = sums.data() + bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + degAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; + uint64_t s = 0; + for (int k = 0; k < DegAccT::size(); ++k) + s += degAcc.mIndices[k][i]; + bs[batchStart + i] = s; + } + } + } + }); + }); + degenerateChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("degenerate") + + // ---- Hybrid floor (CSE-resistant): 18 distinct taps wrapped to center leaf ---- + double inLeafUs = 0.0; + uint64_t inLeafChecksum = 0; + if (wantPass("inleaf")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + inLeafUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + InLeafAccT inLeafAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + uint64_t* bs = sums.data() + bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + inLeafAcc.moveToInLeaf( + leafIndex + batchStart, voxelOffset + batchStart); + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; + uint64_t s = 0; + for (int k = 0; k < InLeafAccT::size(); ++k) + s += inLeafAcc.mIndices[k][i]; + bs[batchStart + i] = s; + } + } + } + }); + }); + inLeafChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("inleaf") + + // ---- Legacy cost decomposition variants ---- + // (a) "framing only" — Legacy loop structure, no accessor call (anti-DCE writes use li+k). + // Measures: decodeInverseMaps + Coord compute + 18-iteration inner loop + anti-DCE store. + // (b) "center-hit only" — Legacy loop + 18× mAcc.getValue(center) instead of tap offsets. + // Always hits the ReadAccessor's leaf cache → no tree walk. + // Measures: framing + cache-query + leaf-local lookup (mValueMask + mPrefixSum + popcount). + // (c) "full" — the original LegacyStencilAccessor path. + // Measures: framing + cache-query + leaf-local lookup + tree-walk-on-miss. + // + // Tree-walk cost per voxel ≈ full − center-hit. + // Cache + leaf-lookup per voxel ≈ center-hit − framing. + // Framing per voxel ≈ framing. 
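+    //
+    // Caveat on the attribution (see BatchAccessor.md §8j and
+    // StencilAccessor.md §8.2): center-hit's guarded return is compiled to a
+    // cmov, so the "full − center-hit" delta bundles the isOn branch-mispredict
+    // cost together with the actual tree walk; the PMU decomposition attributes
+    // most of that delta to mispredicts, not to pointer chasing.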
+ + double framingUs = 0.0; + if (wantPass("framing")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + framingUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t* bs = bs0 + bID * BlockWidth; + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); + // 18 trivial "taps" — no accessor call; anti-DCE via Coord components. + uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) + s += static_cast(center.x() + center.y() + center.z() + k); + bs[i] = s; + } + } + }); + }); + } // end wantPass("framing") + + double centerHitUs = 0.0; + if (wantPass("center-hit")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + centerHitUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + // Leaf-only cache (levels 1/2 would never be consulted for + // GetValue — see NanoVDB.h:5387 — and would only pay passive + // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. + nanovdb::ReadAccessor acc(grid->tree().root()); + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t* bs = bs0 + bID * BlockWidth; + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + (void)voxelOffset[i]; // keep decode non-dead + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + // 18 distinct positions ALL within this leaf's 8^3 footprint + // — guarantees leaf-cache hit on every call, but each coord + // is unique so the compiler can't CSE the lookups. + // k in [0..17]: local (k&7, (k>>3)&1, 0) sweeps an 8x2x1 slab. 
+ uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) { + const nanovdb::Coord c = cOrigin + + nanovdb::Coord(k & 7, (k >> 3) & 1, 0); + s += static_cast(acc.getValue(c)); + } + bs[i] = s; + } + } + }); + }); + + } // end wantPass("center-hit") + + // ---- LegacyStencilAccessor ---- + double legacyUs = 0.0; + uint64_t legacyChecksum = 0; + if (wantPass("legacy")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + + legacyUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + LegacyAccT legacyAcc(*grid); // one ReadAccessor per task + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t* bs = bs0 + bID * BlockWidth; + + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); + uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; + bs[i] = s; + } + } + }); + }); + + legacyChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("legacy") + + // ---- Legacy branchless: same as legacy but skip the leaf.getValue isOn branch ---- + // Replaces `leaf.getValue(offset)` (which branches on valueMask.isOn(offset)) + // with the unconditional formula: + // mOffset + prefix9(wordIdx) + popcount(maskWord & ((1<); + + legacyBranchlessUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + // Leaf-only cache (levels 1/2 would never be consulted for + // GetValue — see NanoVDB.h:5387 — and would only pay passive + // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. 
+ nanovdb::ReadAccessor acc(grid->tree().root()); + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + uint64_t* bs = bs0 + bID * BlockWidth; + + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); + + uint64_t s = 0; + auto addTap = [&](int di, int dj, int dk) { + const nanovdb::Coord c = center + nanovdb::Coord(di, dj, dk); + const LeafT* leaf = acc.probeLeaf(c); + if (!leaf) return; // tap outside narrow band (predictable branch + // for active-region voxels) + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + // NanoVDB LeafData::getValueBranchless -- + // same formula as getValue but with the isOn check + // replaced by a cmov-style mask gate. + s += leaf->data()->getValueBranchless(offset); + }; + + // Unroll all 18 WENO5 taps via the compile-time tuple. + [&](std::index_sequence) { + (addTap( + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk + ), ...); + }(std::make_index_sequence{}); + + bs[i] = s; + } + } + }); + }); + + legacyBranchlessChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("legacy-branchless") + + std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", + nBlocks, nVoxels); + std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n", + decodeUs / 1e3, decodeUs * 1e3 / double(nVoxels)); + std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels), + (stencilUs - decodeUs) / 1e3, stencilChecksum); + std::printf(" Degenerate (18×center): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + degenerateUs / 1e3, degenerateUs * 1e3 / double(nVoxels), + (degenerateUs - decodeUs) / 1e3, degenerateChecksum); + std::printf(" InLeaf (18 distinct) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + inLeafUs / 1e3, inLeafUs * 1e3 / double(nVoxels), + (inLeafUs - decodeUs) / 1e3, inLeafChecksum); + std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), + (legacyUs - decodeUs) / 1e3, legacyChecksum); + std::printf(" Legacy branchless : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + legacyBranchlessUs / 1e3, legacyBranchlessUs * 1e3 / double(nVoxels), + (legacyBranchlessUs - decodeUs) / 1e3, legacyBranchlessChecksum); + + // Decomposition of LegacyStencilAccessor's ns/voxel: + // framing = no accessor call + // cache + leaf = centerHit − framing (per 18 taps) + // tree walk = legacy − centerHit (per 18 taps; amortises over ~25% miss rate) + const double framingNs = framingUs * 1e3 / double(nVoxels); + const double centerHitNs = centerHitUs * 1e3 / double(nVoxels); + const double legacyNs = legacyUs * 1e3 / double(nVoxels); + std::printf("\nLegacy cost decomposition (18 taps/voxel):\n"); + std::printf(" 
framing only : %7.1f ms (%5.1f ns/voxel)\n", + framingUs / 1e3, framingNs); + std::printf(" + center-hit × 18 : %7.1f ms (%5.1f ns/voxel) [cache+leaf = %5.2f ns/vox = %4.2f ns/tap]\n", + centerHitUs / 1e3, centerHitNs, + centerHitNs - framingNs, (centerHitNs - framingNs) / 18.0); + std::printf(" + stencil × 18 (full): %7.1f ms (%5.1f ns/voxel) [tree walk = %5.2f ns/vox = %4.2f ns/tap]\n", + legacyUs / 1e3, legacyNs, + legacyNs - centerHitNs, (legacyNs - centerHitNs) / 18.0); + + if (stencilChecksum != legacyChecksum) + std::cerr << " WARNING: checksums differ — accessor results disagree!\n"; +} + +// ============================================================ +// Entry point +// ============================================================ + +static void printUsage(const char* argv0) +{ + std::cerr + << "Usage: " << argv0 << " " + << " [--grid=]" + << " [--pass=]" + << " [--threads=]" + << " [--skip-validation]\n" + << "\n" + << " Input OpenVDB file (single FloatGrid narrow-band)\n" + << " --grid= Select grid by name (default: first FloatGrid)\n" + << " --pass= Run one perf pass:\n" + << " all (default), verify, decode, stencil,\n" + << " degenerate, inleaf, framing, center-hit,\n" + << " legacy, legacy-branchless\n" + << " --threads= Limit TBB parallelism (0 = TBB default)\n" + << " --skip-validation Skip the sidecar ordering sanity check\n"; +} + +int main(int argc, char** argv) +{ + try { + if (argc < 2 || std::string(argv[1]) == "--help" + || std::string(argv[1]) == "-h") { + printUsage(argv[0]); + return argc < 2 ? 1 : 0; + } + + std::string vdbPath = argv[1]; + std::string gridName = ""; // --grid= + std::string passFilter = "all"; // --pass= + int nThreads = 0; // --threads=, 0 = TBB default + bool skipValidation = false; + + for (int i = 2; i < argc; ++i) { + std::string a = argv[i]; + if (a.rfind("--grid=", 0) == 0) gridName = a.substr(7); + else if (a.rfind("--pass=", 0) == 0) passFilter = a.substr(7); + else if (a.rfind("--threads=", 0) == 0) nThreads = std::stoi(a.substr(10)); + else if (a == "--skip-validation") skipValidation = true; + else { printUsage(argv[0]); return 1; } + } + + std::cout << "vdb path = " << vdbPath << "\n" + << "grid name = " << (gridName.empty() ? "(first FloatGrid)" : gridName) << "\n" + << "pass = " << passFilter << "\n" + << "threads = " << (nThreads > 0 ? std::to_string(nThreads) : std::string("(TBB default)")) << "\n"; + + // ---- OpenVDB setup and .vdb load ---- + openvdb::initialize(); + auto floatGrid = loadFloatGridFromVdb(vdbPath, gridName); + + const auto bbox = floatGrid->evalActiveVoxelBoundingBox(); + const auto dim = bbox.dim(); + const auto vsize = floatGrid->voxelSize(); + std::cout << "FloatGrid:\n" + << " name = \"" << floatGrid->getName() << "\"\n" + << " active voxels = " << floatGrid->activeVoxelCount() << "\n" + << " bbox = [" << bbox.min() << " .. 
" << bbox.max() << "]" + << " dim=" << dim << "\n" + << " voxel size = " << vsize << "\n" + << " background = " << floatGrid->background() << "\n"; + + // ---- Convert to NanoVDB IndexGrid + separately-allocated float sidecar ---- + auto payload = convertToIndexGridWithSidecar(*floatGrid); + auto* grid = payload.handle.grid(); + if (!grid) throw std::runtime_error("Failed to create ValueOnIndex grid"); + + const auto& tree = grid->tree(); + std::cout << "IndexGrid:\n" + << " leaves = " << tree.nodeCount(0) << "\n" + << " lower nodes = " << tree.nodeCount(1) << "\n" + << " upper nodes = " << tree.nodeCount(2) << "\n" + << " active voxels = " << grid->activeVoxelCount() << "\n" + << " valueCount = " << grid->valueCount() << "\n" + << " sidecar entries = " << payload.sidecar.size() << "\n"; + + // ---- Sidecar ordering sanity check ---- + if (!skipValidation) { + if (validateSidecarOrdering(*floatGrid, *grid, payload.sidecar) != 0) + throw std::runtime_error( + "sidecar ordering mismatch -- aborting before benchmarks"); + } + + // ---- VBM ---- + auto vbmHandle = nanovdb::tools::buildVoxelBlockManager(grid); + std::cout << "VBM:\n" + << " blocks = " << vbmHandle.blockCount() + << " (BlockWidth=" << BlockWidth << ")\n\n"; + + // TBB thread-count limit for perf measurements. + std::unique_ptr tbbLimit; + if (nThreads > 0) { + tbbLimit = std::make_unique( + tbb::global_control::max_allowed_parallelism, (size_t)nThreads); + } + + if (passFilter == "all" || passFilter == "verify") + runPrototype(grid, vbmHandle); + runPerf(grid, vbmHandle, passFilter); + + // Silence unused-variable warning for sidecar until a future pass uses it. + (void)payload.sidecar; + } catch (const std::exception& e) { + std::cerr << "Exception: " << e.what() << "\n"; + return 1; + } + return 0; +} From 1847b1a6649c9deb49a1a61b629973c505f6b7d8 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sun, 19 Apr 2026 14:44:55 -0500 Subject: [PATCH 38/60] BatchAccessor/StencilAccessor: document getValueBranchless, narrow-band findings, leaf-only accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-on chronicle for the investigation in §8j. Three things happened since: (1) the hand-inlined branchless variant was promoted into NanoVDB.h proper as LeafData::getValueBranchless; (2) a real narrow-band benchmark (ex_narrowband_stencil_cpu, taperLER.vdb) was added to validate the finding on a workload with realistic spatial coherence; (3) we noticed the scaffolding was using a 3-level ReadAccessor where only the leaf-level cache can contribute and switched to ReadAccessor. BatchAccessor.md §8k (new, ~170 lines) — the new findings: - §8k.1 The new API: getValueBranchless code, semantics (identical to getValue; checksum matches byte-for-byte), scope decision (LeafData only, not LeafNode/ReadAccessor), note that the shipped version includes the OFF-returns-0 mask gate that the hand-inlined benchmarking variant omitted. - §8k.2 The new ex_narrowband_stencil_cpu benchmark — .vdb loading, topology-only IndexGrid + separately-allocated float sidecar path, startup sidecar validation. - §8k.3 Measurement matrix (single-P-core, PMU counters): Legacy vs getValueBranchless on synthetic vs narrow-band. Refines §8j: narrow-band is NOT pathological for branch prediction (1.74% miss rate, IPC 4.22); random-access is (8.07% miss rate, IPC 1.96). getValueBranchless recovers near-peak IPC on both. 
Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/util/BatchAccessor.md   | 192 +++++++++++++++++++++++-
 nanovdb/nanovdb/util/StencilAccessor.md |  31 ++--
 2 files changed, 202 insertions(+), 21 deletions(-)

diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md
index f861d3f141..7b09661fec 100644
--- a/nanovdb/nanovdb/util/BatchAccessor.md
+++ b/nanovdb/nanovdb/util/BatchAccessor.md
@@ -1283,6 +1283,169 @@ the perf numbers in this section change when it is toggled.
 | "`NANOVDB_USE_INTRINSICS` matters for popcount-heavy paths" | general assumption | No-op on GCC `-O3 -march=native`: SWAR → popcnt pattern match. Enable for portability anyway. |
 | "27-leaf cache is the architectural win of BatchAccessor" | §8i "Cost of the refactor" | Cache delta is ~0.3 ns/voxel. Real wins are the Simd-free API and flatten-free compiler portability (§8i). |
 
+### 8k. Follow-up: `LeafData::getValueBranchless`, narrow-band validation, and accessor cache-level
+
+Follow-on to §8j. Three things happened:
+(1) the branchless reformulation of `leaf.getValue` was moved from a
+hand-inlined benchmark hack into `NanoVDB.h` proper as a new method on
+`LeafData`;
+(2) a second example (`ex_narrowband_stencil_cpu`) was added to validate
+the finding on a real narrow-band level set rather than a pathological
+random-occupancy synthetic;
+(3) we noticed the scaffolding was using the default 3-level
+`ReadAccessor` when only the leaf-level cache can actually contribute,
+and switched to a 1-level, leaf-only `ReadAccessor`.
+
+#### 8k.1 The new API: `LeafData::getValueBranchless`
+
+Located at `NanoVDB.h:4161`, sibling to the existing `getValue` at 4139.
+Same signature, same inputs, bit-for-bit identical output:
+
+```cpp
+__hostdev__ uint64_t getValueBranchless(uint32_t i) const
+{
+    const uint32_t n = i >> 6;
+    const uint64_t w = BaseT::mValueMask.words()[n];
+    const uint64_t bit = uint64_t(1) << (i & 63u);
+    const uint64_t prefix = n == 0u ? uint64_t(0)
+                          : (BaseT::mPrefixSum >> (9u*(n-1u))) & 511u;
+    const uint64_t sum = BaseT::mOffset + prefix + util::countOn(w & (bit - 1u));
+    const uint64_t mask = (w & bit) ? ~uint64_t(0) : uint64_t(0);
+    return mask & sum;
+}
+```
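+
+To make the bit-for-bit claim easy to check in isolation, here is a
+standalone model of the two decode paths (plain stand-ins for
+`mValueMask`/`mPrefixSum`/`mOffset`, not NanoVDB code) that asserts
+agreement on every offset of randomly generated leaves:
+
+```cpp
+// Standalone equivalence check: branchy vs branchless index decode over a
+// synthetic 512-bit leaf mask (C++20 for std::popcount).
+#include <cassert>
+#include <cstdint>
+#include <bit>
+#include <random>
+
+struct LeafModel {
+    uint64_t words[8] = {};  // 512-bit value mask
+    uint64_t prefixSum = 0;  // seven 9-bit popcount prefixes for words 1..7
+    uint64_t offset = 0;     // first global value index of this leaf
+
+    uint64_t branchy(uint32_t i) const {
+        const uint32_t n = i >> 6;
+        const uint64_t w = words[n], bit = uint64_t(1) << (i & 63u);
+        if (!(w & bit)) return 0u;                        // OFF -> background
+        uint64_t sum = offset + std::popcount(w & (bit - 1u));
+        if (n) sum += (prefixSum >> (9u * (n - 1u))) & 511u;
+        return sum;
+    }
+    uint64_t branchless(uint32_t i) const {
+        const uint32_t n = i >> 6;
+        const uint64_t w = words[n], bit = uint64_t(1) << (i & 63u);
+        const uint64_t prefix = n == 0u ? uint64_t(0)
+                              : (prefixSum >> (9u * (n - 1u))) & 511u;
+        const uint64_t sum = offset + prefix + std::popcount(w & (bit - 1u));
+        return ((w & bit) ? ~uint64_t(0) : uint64_t(0)) & sum; // test+cmov gate
+    }
+};
+
+int main() {
+    std::mt19937_64 rng(42);
+    for (int trial = 0; trial < 1000; ++trial) {
+        LeafModel leaf;
+        leaf.offset = rng() >> 40;   // arbitrary small base index
+        uint64_t running = 0;        // popcount of words [0..n-1]
+        for (int n = 0; n < 8; ++n) {
+            leaf.words[n] = rng();
+            if (n) leaf.prefixSum |= running << (9 * (n - 1));
+            running += std::popcount(leaf.words[n]);
+        }
+        for (uint32_t i = 0; i < 512u; ++i)
+            assert(leaf.branchy(i) == leaf.branchless(i));
+    }
+    return 0;
+}
+```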
+Key design points:
+- Scoped to `LeafData` (not `LeafNode`) — opt-in expert path for
+  neighbourhood-aware cachers; the generic `LeafNode::getValue` and the
+  `ReadAccessor::getValue` chain are unchanged.
+- The ternary `(w & bit) ? ~0ull : 0ull` compiles to `test + cmov` on
+  x86 (verified on GCC 13 / `-O3 -march=native`), eliminating the
+  mispredict-prone conditional-jump pattern of the original `getValue`.
+- The prefix-extract ternary (`n == 0u ? 0 : …`) is kept as-is — its
+  outcome is 7:1 biased and the predictor handles it cleanly, so
+  expanding it to branchless arithmetic wouldn't help and would risk
+  tripping UB on the `n-1` shift for `n==0`.
+- OFF voxels still return 0 (gated by the mask-AND at the end), so the
+  method is a drop-in replacement for `getValue`. **Checksum matches
+  byte-for-byte on all measured workloads.**
+
+During the earlier investigation we'd used a hand-inlined variant that
+skipped the gate — faster (~5% on single-thread), semantically wrong
+(OFF voxels returned the formula's non-zero junk). The shipped method
+includes the gate and is the correct drop-in.
+
+#### 8k.2 `ex_narrowband_stencil_cpu` — realistic workload benchmark
+
+New example under `nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/`.
+Structurally a clone of `ex_stencil_gather_cpu` (same `--pass=` /
+`--threads=` CLI, same set of decomposition variants), but replaces
+the procedural random-occupancy domain with `.vdb` file loading:
+
+- `openvdb::io::File(path).readGrid(name)` → `openvdb::FloatGrid`
+- `nanovdb::tools::CreateNanoGrid(grid).getHandle<
+  ValueOnIndex, HostBuffer>(channels=0, ...)` → topology-only `NanoGrid`
+- `builder.copyValues(sidecar.data())` → separately-
+  allocated `std::vector<float>` sidecar (no blind-data residue in the
+  grid). Ordering sanity-checked at startup (1000 samples).
+
+The sidecar is plumbed through but not yet consumed by any stencil path
+— placeholder for future "fetch values via the sidecar" work.
+
+Test input: `taperLER.vdb`, a ~129 MB narrow-band `UnsignedDistanceField`
+FloatGrid with 31.8 M active voxels over a 1125×1081×762 bbox.
+
+#### 8k.3 Narrow-band vs synthetic measurement matrix
+
+Single P-core, `--threads=1`, PMU counters, `-O3 -march=native`:
+
+| Variant              | Workload    | ns/voxel |      IPC | branch-miss | L1 miss |
+|----------------------|-------------|---------:|---------:|------------:|--------:|
+| Legacy               | narrow-band |     47.0 |     4.22 |      1.74 % |  0.06 % |
+| `getValueBranchless` | narrow-band | **34.5** | **5.55** |  **0.45 %** |  0.07 % |
+| Legacy               | synthetic   |    106.1 |     1.96 |      8.07 % |  0.36 % |
+| `getValueBranchless` | synthetic   | **37.9** | **4.55** |  **1.63 %** |  0.39 % |
+
+Two observations that refine §8j:
+
+1. **Narrow-band is *not* pathological for branch prediction.** At 1.74 %
+   miss rate the branch predictor handles spatially-coherent traversals
+   well enough that the original `getValue` runs at IPC ~4.2 (near peak
+   for integer code). The isOn branch is only catastrophic when access
+   patterns are genuinely unpredictable; narrow-band SDF walks aren't.
+2. **`getValueBranchless` still wins on narrow-band** (47 → 34.5 ns/vox,
+   1.4×) because the branch is still data-dependent even if mostly
+   predictable — every ~1 in 60 calls costs ~15 cycles. On synthetic
+   the benefit is much larger (2.8×) because there's a genuine
+   mispredict storm to eliminate.
+
+Per-call instruction count is within a handful of instructions of
+`getValue` in both cases; L1 behaviour is identical. The speedup is
+entirely branch-mispredict-pipeline-stall recovery.
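+
+The mechanism is easy to reproduce outside NanoVDB. The toy below is
+illustrative only (synthetic masks and a loop index stand in for leaf
+data, and some compilers may if-convert the branchy loop, so check the
+generated code): it gates a sum on a dense, narrow-band-like mask
+(~98 % ON) and on a 50 % random mask, and the branchy form collapses
+only on the latter:
+
+```cpp
+// Toy demonstration: a data-dependent branch vs a mask-AND gate, on a
+// predictable (dense) and an unpredictable (50% random) bitmask.
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <random>
+#include <vector>
+
+static volatile uint64_t g_sink; // defeat dead-code elimination
+
+static std::vector<uint64_t> makeMask(size_t nWords, double pOn, uint64_t seed) {
+    std::mt19937_64 rng(seed);
+    std::bernoulli_distribution on(pOn);
+    std::vector<uint64_t> words(nWords, 0);
+    for (auto& w : words)
+        for (int b = 0; b < 64; ++b)
+            if (on(rng)) w |= uint64_t(1) << b;
+    return words;
+}
+
+template <bool Branchy>
+static double timeSum(const std::vector<uint64_t>& words) {
+    const uint32_t n = uint32_t(words.size()) * 64u;
+    const auto t0 = std::chrono::steady_clock::now();
+    uint64_t s = 0;
+    for (uint32_t i = 0; i < n; ++i) {
+        const uint64_t w = words[i >> 6], bit = uint64_t(1) << (i & 63u);
+        if constexpr (Branchy) {
+            if (w & bit) s += i;                                   // conditional jump
+        } else {
+            s += ((w & bit) ? ~uint64_t(0) : uint64_t(0)) & uint64_t(i); // cmov gate
+        }
+    }
+    g_sink = s;
+    const auto t1 = std::chrono::steady_clock::now();
+    return std::chrono::duration<double, std::milli>(t1 - t0).count();
+}
+
+int main() {
+    for (double p : {0.98, 0.5}) {
+        const auto words = makeMask(size_t(1) << 20, p, /*seed=*/7); // 2^26 bits
+        std::printf("p(ON) = %.2f\n", p);
+        std::printf("  branchy    : %7.1f ms\n", timeSum<true>(words));
+        std::printf("  branchless : %7.1f ms\n", timeSum<false>(words));
+    }
+    return 0;
+}
+```
+
+On the real leaf path the same gate sits behind `mPrefixSum` and
+`util::countOn`, but the stall profile is the branch's, which is why
+the table above moves with mask predictability rather than with the
+arithmetic.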
The speedup is entirely +branch-mispredict-pipeline-stall recovery. + +#### 8k.4 Accessor cache-level finding + +The `ReadAccessor` (`DefaultReadAccessor`) maintains +three cache slots (leaf, lower, upper). For `GetValue` workloads the +upper/lower slots are **never consulted** on a leaf-cache miss — +`ReadAccessor::get` falls straight through to `mRoot->getAndCache` +(NanoVDB.h:5387) — they're only written as passive side-effects of the +root-walk's `acc.insert(ijk, child)` calls at each level. + +Switching the scaffolding to `ReadAccessor` +(`LegacyStencilAccessor.h`, plus the `center-hit` / `legacy-branchless` +passes of both examples) removes those passive writes. Measured 32- +thread wall-clock deltas: + +| Workload, config | Legacy | `getValueBranchless` | +|---------------------------|--------------:|---------------------:| +| narrow-band, 8 P-cores | no change | 140.0 → 132.1 ms (−5.6 %) | +| narrow-band, 24 cores | no change | 66.1 → 60.3 ms (−8.8 %) | +| synthetic, 8 P-cores | no change | 80.8 → 76.8 ms (−5.0 %) | +| synthetic, 24 cores | no change | 35.8 → 34.3 ms (−4.2 %) | + +Legacy paths are backend-bound on mispredicts — the extra stores +overlap for free in the stall cycles. The branchless paths run at +near-peak IPC (~5.5) where there is no slack, so every retired +instruction shows up. Classic Amdahl corollary: the closer to peak, +the more every small thing matters. + +**Scope caveat** (for any future "should the library default change" +discussion): the 1-level accessor is strictly better only for +`GetValue`-only hot loops. `probeValue`, `probeLeaf`, and +`isActive`/`GetState` queries do traverse at levels ≥ 1 and benefit from +the upper/lower slots. `DefaultReadAccessor` is the right default for +mixed workloads; opt into 1-level only when you know the loop is +`GetValue`-exclusive. + +#### 8k.5 End-to-end headline numbers (updated) + +24-core Arrow Lake, full pipeline including decode: + +| Workload | Legacy | `getValueBranchless` | Speedup | +|-----------------------------------|-------:|---------------------:|--------:| +| Narrow-band taperLER (31.8 M) | 85 ms | **60 ms** | 1.4 × | +| Synthetic random 50% (16.7 M) | 95 ms | **34 ms** | 2.8 × | + +Speedup is thread-count-independent (same ratio across 8 P-cores and +24 cores). The two workloads' speedup *spread* — 1.4 × vs 2.8 × — +tracks exactly how unpredictable the isOn branch is for each pattern. + +#### 8k.6 What this updates in the §10 Remaining list + +The "Branchless `LeafNode::getValue`" item is complete +(shipped at the `LeafData` level per the scope decision, with benchmark +coverage on both synthetic and real narrow-band workloads). Future +follow-ons implied by this work but not pursued here: +- A `ProbeValue::get` variant that reuses `getValueBranchless` and the + already-computed `(w & bit)` to eliminate the redundant second + `isOn` test at NanoVDB.h:6302–6306. +- Steering-team proposal for the NanoVDB library: adopt + `getValueBranchless` as a public API (or possibly as the default for + `LeafData::getValue`, if the single-thread ~14 % + instruction-count increase is acceptable given its branchless + universal applicability). + --- ## 9. Relationship to Phase 1 Prototype @@ -1352,15 +1515,28 @@ the perf numbers in this section change when it is toggled. (SWAR `util::countOn` is pattern-matched to hardware `popcnt`). Enable it in the build anyway for portability. -### Remaining +- **`LeafData::getValueBranchless` in `NanoVDB.h` (§8k)**: + shipped. 
Branchless sibling to `getValue`; same semantics, `test+cmov` + gate instead of a conditional jump. Validated on both synthetic random + 50% (2.8× end-to-end speedup on 24 cores) and real narrow-band + `taperLER.vdb` (1.4× speedup). + +- **`ex_narrowband_stencil_cpu` (§8k.2)**: new `.vdb`-based benchmark + companion to `ex_stencil_gather_cpu`. Loads an openvdb `FloatGrid`, + converts to `ValueOnIndex` topology + separately-allocated float + sidecar, runs the same perf-decomposition battery on realistic + narrow-band workloads. + +- **Leaf-only `ReadAccessor` in benchmark scaffolding + (§8k.4)**: `LegacyStencilAccessor` and the `center-hit` / + `legacy-branchless` passes switched from `DefaultReadAccessor` (3-level + cache) to a 1-level leaf-only cache. Upper/lower slots are never + consulted for `GetValue` workloads; the switch removes passive + bookkeeping and gives 4–9 % additional speedup on branchless paths. + Scope: benchmark-only; the library default is unchanged (right default + for `probeValue`/`probeLeaf`/mixed workloads). -- **Branchless `LeafNode::getValue(offset)` in NanoVDB** (§8j). - The single biggest available CPU-side speedup for any stencil caller. A - ~15-line rewrite that preserves the OFF-returns-0 semantics via an - arithmetic mask gate instead of a conditional `return 0` would give - Legacy, the hybrid, HaloStencilAccessor, and any future variant a 2–3× - end-to-end speedup on 32-thread WENO5 workloads. Needs benchmarking to - confirm GCC/Clang don't refold the gate back into a branch. +### Remaining - **`[[gnu::always_inline]]` on `Simd.h` helpers** (§8f) vs **`[[gnu::flatten]]` on StencilAccessor-style entry points** (§8h): diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md index 17efc3f40e..74c1542e57 100644 --- a/nanovdb/nanovdb/util/StencilAccessor.md +++ b/nanovdb/nanovdb/util/StencilAccessor.md @@ -517,19 +517,24 @@ because both pay the same dominant `isOn` mispredict cost. - **The shipped hybrid design is the right API choice** (Simd-free public surface, compiler-portable) but its wall-clock edge over Legacy is marginal (~0.3 ns/voxel), not the ~3 ns/voxel originally implied. -- **The cheap architectural win is a branchless - `LeafNode::getValue(offset)` in NanoVDB** — ~15 lines - that would speed up every stencil gather caller (Legacy, hybrid, - HaloStencilAccessor, future variants) by ~3×. -- **HaloStencilAccessor's value proposition is validated**: its precomputed - uint64 index buffer naturally eliminates `isOn` branches by never evaluating - them. The speedup over branchless-leaf is smaller than previously - framed (~0.5–1 ns/voxel rather than sub-2 ns/voxel territory), but - still real. Worth building for the absolute-perf cases. - -See `BatchAccessor.md` §8j for the full measurement matrix, methodology, -correction log relative to §8g/§8h/§8i, and the branchless-experiment -source. +- **The cheap architectural win was a branchless variant of + `LeafData::getValue`**: `getValueBranchless`, shipped in + `NanoVDB.h:4161` (see `BatchAccessor.md` §8k). Opt-in expert path for + neighbourhood-aware cachers; end-to-end 1.4× on realistic narrow-band + workloads, 2.8× on random-access. +- **HaloStencilAccessor's value proposition is validated but narrower**: + its precomputed uint64 index buffer naturally eliminates `isOn` + branches by never evaluating them. 
Now that `getValueBranchless` + captures the same win cheaply, the halo's remaining advantage is + "zero per-tap work at query time" rather than "avoids the isOn + mispredict storm." Worth building for the absolute-perf cases; less + urgent than previously framed. + +See `BatchAccessor.md` §8j for the original measurement matrix and +correction log (§8g/§8h/§8i), and `BatchAccessor.md` §8k for the +follow-on that added `getValueBranchless`, the narrow-band validation +benchmark (`ex_narrowband_stencil_cpu`), and the leaf-only +`ReadAccessor` finding. --- From 64596959f6d1c9772902ad5d86ed946af19367df Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Sun, 19 Apr 2026 15:07:40 -0500 Subject: [PATCH 39/60] NanoVDB: LeafData::getValue branchless by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fold the branchless reformulation (previously shipped as a sibling getValueBranchless in 8a24ddfd) into getValue as its default body, and gate the pre-2026 branchy form behind NANOVDB_USE_BRANCHY_GETVALUE. The branchless form is strictly faster or within ~0.1 ns/vox on every workload measured, and the OFF path is preserved bit-for-bit via the ternary-constant mask-AND -- checksums match. Update the two CPU stencil-gather examples to call the single getValue entry point, and refresh BatchAccessor.md §8k + StencilAccessor.md §8.2 to describe the toggle-based final API and the API evolution history. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/NanoVDB.h | 32 +---- .../narrowband_stencil_cpu.cpp | 7 +- .../stencil_gather_cpu.cpp | 7 +- nanovdb/nanovdb/util/BatchAccessor.md | 133 ++++++++++-------- nanovdb/nanovdb/util/StencilAccessor.md | 27 ++-- 5 files changed, 105 insertions(+), 101 deletions(-) diff --git a/nanovdb/nanovdb/NanoVDB.h b/nanovdb/nanovdb/NanoVDB.h index e8a179d702..dc6deb6065 100644 --- a/nanovdb/nanovdb/NanoVDB.h +++ b/nanovdb/nanovdb/NanoVDB.h @@ -4136,44 +4136,26 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafDatahasStats() ? this->lastOffset() + 2u : 0u; } __hostdev__ uint64_t getAvg() const { return this->hasStats() ? this->lastOffset() + 3u : 0u; } __hostdev__ uint64_t getDev() const { return this->hasStats() ? this->lastOffset() + 4u : 0u; } + // Default branchless; define NANOVDB_USE_BRANCHY_GETVALUE to restore the + // pre-2026 branchy form. See BatchAccessor.md §8k for rationale. __hostdev__ uint64_t getValue(uint32_t i) const { - //return mValueMask.isOn(i) ? mOffset + mValueMask.countOn(i) : 0u;// for debugging +#ifdef NANOVDB_USE_BRANCHY_GETVALUE uint32_t n = i >> 6; const uint64_t w = BaseT::mValueMask.words()[n], mask = uint64_t(1) << (i & 63u); - if (!(w & mask)) return uint64_t(0); // if i'th value is inactive return offset to background value + if (!(w & mask)) return uint64_t(0); uint64_t sum = BaseT::mOffset + util::countOn(w & (mask - 1u)); if (n--) sum += BaseT::mPrefixSum >> (9u * n) & 511u; return sum; - } - - /// @brief Branchless variant of getValue, intended for neighbourhood-aware - /// caching paths (e.g. nanovdb::BatchAccessor's scalar tail) where the - /// per-tap mValueMask.isOn(offset) test is data-dependent and unpredictable. - /// - /// Semantics are identical to getValue: returns 0 for inactive voxels, - /// otherwise returns mOffset + prefix9(n) + popcount(w & (bit - 1u)). 
- /// Implementation uses a ternary-constant gate that the compiler emits as - /// `test + cmov` on x86, avoiding the branch-mispredict storm observed on - /// workloads with low spatial coherence between tap positions and the - /// source mValueMask. See BatchAccessor.md §8j for the perf-counter - /// investigation that motivated this variant. - __hostdev__ uint64_t getValueBranchless(uint32_t i) const - { +#else const uint32_t n = i >> 6; const uint64_t w = BaseT::mValueMask.words()[n]; const uint64_t bit = uint64_t(1) << (i & 63u); - // prefix9 extraction: predictable branch (n=0 only for the first - // x-slice; compilers emit as cmov). Kept as a ternary rather than - // unconditional because the prefix-sum shift would be UB at n=0. const uint64_t prefix = n == 0u ? uint64_t(0) : (BaseT::mPrefixSum >> (9u * (n - 1u))) & 511u; const uint64_t sum = BaseT::mOffset + prefix + util::countOn(w & (bit - 1u)); - // 0 for inactive voxels, all-ones mask for active. The ternary on - // two compile-time-constant arms compiles to test + cmov, not to a - // conditional jump -- the whole point of this method. - const uint64_t mask = (w & bit) ? ~uint64_t(0) : uint64_t(0); - return mask & sum; + return ((w & bit) ? ~uint64_t(0) : uint64_t(0)) & sum; +#endif } }; // LeafData diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index 5f5dd0aed3..87a52704fd 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -787,10 +787,9 @@ static void runPerf( const uint32_t offset = (uint32_t(c[0] & 7) << 6) | (uint32_t(c[1] & 7) << 3) | uint32_t(c[2] & 7); - // NanoVDB LeafData::getValueBranchless -- - // same formula as getValue but with the isOn check - // replaced by a cmov-style mask gate. - s += leaf->data()->getValueBranchless(offset); + // LeafData::getValue (branchless by default; + // see NanoVDB.h + BatchAccessor.md §8k). + s += leaf->data()->getValue(offset); }; // Unroll all 18 WENO5 taps via the compile-time tuple. diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index dc6e35e033..2590386a4a 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -693,10 +693,9 @@ static void runPerf( const uint32_t offset = (uint32_t(c[0] & 7) << 6) | (uint32_t(c[1] & 7) << 3) | uint32_t(c[2] & 7); - // NanoVDB LeafData::getValueBranchless -- - // same formula as getValue but with the isOn check - // replaced by a cmov-style mask gate. - s += leaf->data()->getValueBranchless(offset); + // LeafData::getValue (branchless by default; + // see NanoVDB.h + BatchAccessor.md §8k). + s += leaf->data()->getValue(offset); }; // Unroll all 18 WENO5 taps via the compile-time tuple. diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md index 7b09661fec..b7cb4a1408 100644 --- a/nanovdb/nanovdb/util/BatchAccessor.md +++ b/nanovdb/nanovdb/util/BatchAccessor.md @@ -1283,12 +1283,13 @@ the perf numbers in this section change when it is toggled. | "`NANOVDB_USE_INTRINSICS` matters for popcount-heavy paths" | general assumption | No-op on GCC `-O3 -march=native`: SWAR → popcnt pattern match. Enable for portability anyway. 
| | "27-leaf cache is the architectural win of BatchAccessor" | §8i "Cost of the refactor" | Cache delta is ~0.3 ns/voxel. Real wins are the Simd-free API and flatten-free compiler portability (§8i). | -### 8k. Follow-up: `LeafData::getValueBranchless`, narrow-band validation, and accessor cache-level +### 8k. Follow-up: branchless `LeafData::getValue`, narrow-band validation, and accessor cache-level Follow-on to §8j. Three things happened: (1) the branchless reformulation of `leaf.getValue` was moved from a -hand-inlined benchmark hack into `NanoVDB.h` proper as a new method on -`LeafData`; +hand-inlined benchmark hack into `NanoVDB.h` proper and made the default +body of `LeafData::getValue`, gated by +`NANOVDB_USE_BRANCHY_GETVALUE` for the legacy form; (2) a second example (`ex_narrowband_stencil_cpu`) was added to validate the finding on a real narrow-band level set rather than a pathological random-occupancy synthetic; @@ -1296,44 +1297,66 @@ random-occupancy synthetic; `ReadAccessor` when only the leaf-level cache can actually contribute, and switched to `ReadAccessor`. -#### 8k.1 The new API: `LeafData::getValueBranchless` +#### 8k.1 The API change: branchless `getValue` by default, toggle for the old form -Located at `NanoVDB.h:4161`, sibling to the existing `getValue` at 4139. -Same signature, same inputs, bit-for-bit identical output: +`LeafData::getValue` (NanoVDB.h:~4140) is now a +preprocessor-toggled pair: the branchless form is the default; defining +`NANOVDB_USE_BRANCHY_GETVALUE` at compile time restores the pre-2026 +branchy implementation. ```cpp -__hostdev__ uint64_t getValueBranchless(uint32_t i) const +__hostdev__ uint64_t getValue(uint32_t i) const { +#ifdef NANOVDB_USE_BRANCHY_GETVALUE + uint32_t n = i >> 6; + const uint64_t w = BaseT::mValueMask.words()[n], mask = uint64_t(1) << (i & 63u); + if (!(w & mask)) return uint64_t(0); + uint64_t sum = BaseT::mOffset + util::countOn(w & (mask - 1u)); + if (n--) sum += BaseT::mPrefixSum >> (9u * n) & 511u; + return sum; +#else const uint32_t n = i >> 6; const uint64_t w = BaseT::mValueMask.words()[n]; const uint64_t bit = uint64_t(1) << (i & 63u); const uint64_t prefix = n == 0u ? uint64_t(0) - : (BaseT::mPrefixSum >> (9u*(n-1u))) & 511u; + : (BaseT::mPrefixSum >> (9u * (n - 1u))) & 511u; const uint64_t sum = BaseT::mOffset + prefix + util::countOn(w & (bit - 1u)); - const uint64_t mask = (w & bit) ? ~uint64_t(0) : uint64_t(0); - return mask & sum; + return ((w & bit) ? ~uint64_t(0) : uint64_t(0)) & sum; +#endif } ``` Key design points: -- Scoped to `LeafData` (not `LeafNode`) — opt-in expert path for - neighbourhood-aware cachers; the generic `LeafNode::getValue` and the - `ReadAccessor::getValue` chain are unchanged. -- The ternary `(w & bit) ? ~0ull : 0ull` compiles to `test + cmov` on - x86 (verified on GCC 13 / `-O3 -march=native`), eliminating the - mispredict-prone conditional-jump pattern of the original `getValue`. -- The prefix-extract ternary (`n == 0u ? 0 : …`) is kept as-is — its - outcome is 7:1 biased and the predictor handles it cleanly, so - expanding it to branchless arithmetic wouldn't help and would risk - tripping UB on the `n-1` shift for `n==0`. -- OFF voxels still return 0 (gated by the mask-AND at the end), so the - method is a drop-in replacement for `getValue`. 
**Checksum matches - byte-for-byte on all measured workloads.** -During the earlier investigation we'd used a hand-inlined variant that -skipped the gate — faster (~5% on single-thread), semantically wrong -(OFF voxels returned the formula's non-zero junk). The shipped method -includes the gate and is the correct drop-in. +- **Default is branchless**, so every caller of + `leaf->getValue(offset)` / `leaf->data()->getValue(offset)` / + `ReadAccessor::getValue(ijk)` inherits the speedup with no code + change. The `NANOVDB_USE_BRANCHY_GETVALUE` macro restores the old + behaviour for bisection, regression testing, or performance + comparison. +- The `(w & bit) ? ~0ull : 0ull` ternary compiles to `test + cmov` on + x86 (verified on GCC 13 at `-O3 -march=native`), eliminating the + mispredict-prone conditional-jump pattern of the branchy form. +- The prefix-extract ternary (`n == 0u ? 0 : ...`) is kept as-is — its + outcome is 7:1 biased, so the predictor handles it cleanly, and the + shift would be UB on `n-1` if `n==0`. +- OFF voxels still return 0 (gated by the mask-AND), so the output is + bit-for-bit identical to the old `getValue`. **Checksum matches + byte-for-byte on all measured workloads.** +- Scoped to `LeafData` — the only build type where the + original early-return guard introduced a data-dependent branch. Other + `LeafData` specializations are unchanged. + +**API evolution note.** During the investigation the branchless form +was first committed as a sibling method `getValueBranchless` (8a24ddfd) +so callers could opt in explicitly. After benchmarking confirmed the +branchless form is strictly faster or within ~0.1 ns/vox on every +workload measured — including cases where the branch is highly +predictable — the sibling was folded into `getValue` as the default, and +the macro toggle was added so the pre-2026 form stays reachable by +explicit opt-in. Early commit messages in this branch may still +reference `getValueBranchless`; the surviving API is the single +toggleable `getValue`. #### 8k.2 `ex_narrowband_stencil_cpu` — realistic workload benchmark @@ -1359,12 +1382,12 @@ FloatGrid with 31.8 M active voxels over a 1125×1081×762 bbox. Single P-core, `--threads=1`, PMU counters, `-O3 -march=native`: -| Variant | Workload | ns/voxel | IPC | branch-miss | L1 miss | -|---------------------|-------------|---------:|-----:|------------:|--------:| -| Legacy | narrow-band | 47.0 | 4.22 | 1.74 % | 0.06 % | -| `getValueBranchless`| narrow-band | **34.5** | **5.55** | **0.45 %** | 0.07 % | -| Legacy | synthetic | 106.1 | 1.96 | 8.07 % | 0.36 % | -| `getValueBranchless`| synthetic | **37.9** | **4.55** | **1.63 %** | 0.39 % | +| Variant | Workload | ns/voxel | IPC | branch-miss | L1 miss | +|--------------------|-------------|---------:|-----:|------------:|--------:| +| branchy | narrow-band | 47.0 | 4.22 | 1.74 % | 0.06 % | +| branchless (default) | narrow-band | **34.5** | **5.55** | **0.45 %** | 0.07 % | +| branchy | synthetic | 106.1 | 1.96 | 8.07 % | 0.36 % | +| branchless (default) | synthetic | **37.9** | **4.55** | **1.63 %** | 0.39 % | Two observations that refine §8j: @@ -1373,7 +1396,7 @@ Two observations that refine §8j: well enough that the original `getValue` runs at IPC ~4.2 (near peak for integer code). The isOn branch is only catastrophic when access patterns are genuinely unpredictable; narrow-band SDF walks aren't. -2. **`getValueBranchless` still wins on narrow-band** (47→34.5 ns/vox, +2. 
**Branchless still wins on narrow-band** (47→34.5 ns/vox, 1.4×) because the branch is still data-dependent even if mostly predictable — every ~1 in 60 calls costs ~15 cycles. On synthetic the benefit is much larger (2.8×) because there's a genuine @@ -1397,12 +1420,12 @@ Switching the scaffolding to `ReadAccessor` passes of both examples) removes those passive writes. Measured 32- thread wall-clock deltas: -| Workload, config | Legacy | `getValueBranchless` | -|---------------------------|--------------:|---------------------:| -| narrow-band, 8 P-cores | no change | 140.0 → 132.1 ms (−5.6 %) | -| narrow-band, 24 cores | no change | 66.1 → 60.3 ms (−8.8 %) | -| synthetic, 8 P-cores | no change | 80.8 → 76.8 ms (−5.0 %) | -| synthetic, 24 cores | no change | 35.8 → 34.3 ms (−4.2 %) | +| Workload, config | Legacy (branchy) | Legacy (branchless, default) | +|---------------------------|-----------------:|-----------------------------:| +| narrow-band, 8 P-cores | no change | 140.0 → 132.1 ms (−5.6 %) | +| narrow-band, 24 cores | no change | 66.1 → 60.3 ms (−8.8 %) | +| synthetic, 8 P-cores | no change | 80.8 → 76.8 ms (−5.0 %) | +| synthetic, 24 cores | no change | 35.8 → 34.3 ms (−4.2 %) | Legacy paths are backend-bound on mispredicts — the extra stores overlap for free in the stall cycles. The branchless paths run at @@ -1422,10 +1445,10 @@ mixed workloads; opt into 1-level only when you know the loop is 24-core Arrow Lake, full pipeline including decode: -| Workload | Legacy | `getValueBranchless` | Speedup | -|-----------------------------------|-------:|---------------------:|--------:| -| Narrow-band taperLER (31.8 M) | 85 ms | **60 ms** | 1.4 × | -| Synthetic random 50% (16.7 M) | 95 ms | **34 ms** | 2.8 × | +| Workload | branchy | branchless (default) | Speedup | +|-----------------------------------|--------:|---------------------:|--------:| +| Narrow-band taperLER (31.8 M) | 85 ms | **60 ms** | 1.4 × | +| Synthetic random 50% (16.7 M) | 95 ms | **34 ms** | 2.8 × | Speedup is thread-count-independent (same ratio across 8 P-cores and 24 cores). The two workloads' speedup *spread* — 1.4 × vs 2.8 × — @@ -1437,14 +1460,12 @@ The "Branchless `LeafNode::getValue`" item is complete (shipped at the `LeafData` level per the scope decision, with benchmark coverage on both synthetic and real narrow-band workloads). Future follow-ons implied by this work but not pursued here: -- A `ProbeValue::get` variant that reuses `getValueBranchless` and the - already-computed `(w & bit)` to eliminate the redundant second +- A `ProbeValue::get` refactor that reuses the already-computed + `(w & bit)` from `getValue` to eliminate the redundant second `isOn` test at NanoVDB.h:6302–6306. -- Steering-team proposal for the NanoVDB library: adopt - `getValueBranchless` as a public API (or possibly as the default for - `LeafData::getValue`, if the single-thread ~14 % - instruction-count increase is acceptable given its branchless - universal applicability). +- Steering-team pitch for making `NANOVDB_USE_BRANCHY_GETVALUE` a + legacy compatibility shim (to be retired after a deprecation window) + rather than a permanent toggle. --- @@ -1515,10 +1536,12 @@ follow-ons implied by this work but not pursued here: (SWAR `util::countOn` is pattern-matched to hardware `popcnt`). Enable it in the build anyway for portability. -- **`LeafData::getValueBranchless` in `NanoVDB.h` (§8k)**: - shipped. Branchless sibling to `getValue`; same semantics, `test+cmov` - gate instead of a conditional jump. 
Validated on both synthetic random - 50% (2.8× end-to-end speedup on 24 cores) and real narrow-band +- **Branchless `LeafData::getValue` in `NanoVDB.h` (§8k)**: + shipped. The default body of `getValue` is now the branchless form + (`test+cmov` gate instead of a conditional jump); defining + `NANOVDB_USE_BRANCHY_GETVALUE` at compile time restores the pre-2026 + branchy version. Same semantics either way. Validated on both synthetic + random 50% (2.8× end-to-end speedup on 24 cores) and real narrow-band `taperLER.vdb` (1.4× speedup). - **`ex_narrowband_stencil_cpu` (§8k.2)**: new `.vdb`-based benchmark diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md index 74c1542e57..059ad3ec2b 100644 --- a/nanovdb/nanovdb/util/StencilAccessor.md +++ b/nanovdb/nanovdb/util/StencilAccessor.md @@ -517,24 +517,25 @@ because both pay the same dominant `isOn` mispredict cost. - **The shipped hybrid design is the right API choice** (Simd-free public surface, compiler-portable) but its wall-clock edge over Legacy is marginal (~0.3 ns/voxel), not the ~3 ns/voxel originally implied. -- **The cheap architectural win was a branchless variant of - `LeafData::getValue`**: `getValueBranchless`, shipped in - `NanoVDB.h:4161` (see `BatchAccessor.md` §8k). Opt-in expert path for - neighbourhood-aware cachers; end-to-end 1.4× on realistic narrow-band - workloads, 2.8× on random-access. +- **The cheap architectural win was a branchless reformulation of + `LeafData::getValue`**: shipped as the default body of + `getValue` in `NanoVDB.h` (see `BatchAccessor.md` §8k), gated by + `NANOVDB_USE_BRANCHY_GETVALUE` to restore the old branchy form. + End-to-end 1.4× on realistic narrow-band workloads, 2.8× on + random-access. - **HaloStencilAccessor's value proposition is validated but narrower**: its precomputed uint64 index buffer naturally eliminates `isOn` - branches by never evaluating them. Now that `getValueBranchless` - captures the same win cheaply, the halo's remaining advantage is - "zero per-tap work at query time" rather than "avoids the isOn - mispredict storm." Worth building for the absolute-perf cases; less - urgent than previously framed. + branches by never evaluating them. Now that the branchless + `getValue` captures the same win cheaply, the halo's remaining + advantage is "zero per-tap work at query time" rather than "avoids + the isOn mispredict storm." Worth building for the absolute-perf + cases; less urgent than previously framed. See `BatchAccessor.md` §8j for the original measurement matrix and correction log (§8g/§8h/§8i), and `BatchAccessor.md` §8k for the -follow-on that added `getValueBranchless`, the narrow-band validation -benchmark (`ex_narrowband_stencil_cpu`), and the leaf-only -`ReadAccessor` finding. +follow-on that made `getValue` branchless-by-default, added the +narrow-band validation benchmark (`ex_narrowband_stencil_cpu`), and +the leaf-only `ReadAccessor` finding. --- From 24c2de77c6e5b6d42fbf08f5ba745113e38a3876 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 20 Apr 2026 11:01:54 -0500 Subject: [PATCH 40/60] ex_*stencil_cpu: prune diagnostic passes, keep just the two shipped paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 8-pass decomposition battery (decode, stencil, degenerate, inleaf, framing, center-hit, legacy, legacy-branchless) was load-bearing while attributing Legacy's cost between tree-walks, leaf-local work, and isOn mispredicts (BatchAccessor.md §8g-§8k). 
That attribution is now settled and getValue is branchless by default, so the diagnostic variants no longer earn their keep. Retain four passes in both ex_stencil_gather_cpu and ex_narrowband_stencil_cpu: - decode : decodeInverseMaps cost baseline - framing : loop + coord compute + anti-DCE, no accessor call - stencil : hybrid StencilAccessor (SIMD cache + scalar getValue tail) - legacy : LegacyStencilAccessor (per-tap probeLeaf + getValue) Drop: DegenerateStencil / InLeafStencil types and their accessor aliases, the degenerate/inleaf/center-hit/legacy-branchless blocks, and the "Legacy cost decomposition" printout that consumed center-hit. Correctness unaffected (StencilAccessor vs LegacyStencilAccessor cross-validation still runs and passes); checksums match between the two shipped paths on both benchmarks. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- .../narrowband_stencil_cpu.cpp | 313 +----------------- .../stencil_gather_cpu.cpp | 310 +---------------- 2 files changed, 15 insertions(+), 608 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index 87a52704fd..e714c6bd91 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -81,55 +81,6 @@ using CPUVBM = nanovdb::tools::VoxelBlockManager; using SAccT = nanovdb::StencilAccessor; using LegacyAccT = nanovdb::LegacyStencilAccessor; -// Decomposition-only stencil: 18 taps all at (0,0,0). Measures the hybrid -// StencilAccessor's floor cost when no tap crosses a leaf boundary and every -// lookup hits the center leaf. Subtracting this from the Weno5 run isolates -// the cross-leaf overhead — BUT the 18 identical compile-time taps give the -// compiler a large CSE opportunity, biasing the number downward. -struct DegenerateStencil { - using Taps = std::tuple< - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0> - >; - // Empty Hull: prefetchHull becomes a no-op; center leaf is always cached - // by BatchAccessor's constructor / advance(). - using Hull = std::tuple<>; -}; -using DegAccT = nanovdb::StencilAccessor; - -// CSE-resistant in-leaf stencil: 18 distinct compile-time taps spanning the -// leaf's 8^3 footprint (all axes, 6 tap offsets in [0..6] per axis). Used -// via StencilAccessor::moveToInLeaf, which applies (voxel_local + tap) mod 8 -// to the center voxel — guaranteeing every tap accesses the center leaf -// while touching distinct mValueMask words across taps and across voxels. -// This isolates the hybrid's single-leaf floor without the CSE bias that -// DegenerateStencil suffers from. 
-struct InLeafStencil { - using Taps = std::tuple< - // x spans 0..6 (hits mValueMask words 0..6 depending on voxel's local x) - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<1,0,0>, - nanovdb::StencilPoint<2,0,0>, nanovdb::StencilPoint<3,0,0>, - nanovdb::StencilPoint<4,0,0>, nanovdb::StencilPoint<5,0,0>, - // y spans 1..6 (different destY positions within a word) - nanovdb::StencilPoint<0,1,0>, nanovdb::StencilPoint<0,2,0>, - nanovdb::StencilPoint<0,3,0>, nanovdb::StencilPoint<0,4,0>, - nanovdb::StencilPoint<0,5,0>, nanovdb::StencilPoint<0,6,0>, - // z spans 1..6 - nanovdb::StencilPoint<0,0,1>, nanovdb::StencilPoint<0,0,2>, - nanovdb::StencilPoint<0,0,3>, nanovdb::StencilPoint<0,0,4>, - nanovdb::StencilPoint<0,0,5>, nanovdb::StencilPoint<0,0,6> - >; - using Hull = std::tuple<>; // moveToInLeaf skips prefetchHull entirely -}; -using InLeafAccT = nanovdb::StencilAccessor; - // ============================================================ // VDB file loading + sidecar harvest // ============================================================ @@ -405,8 +356,7 @@ static void runPerf( const std::string& passFilter = "all") { // wantPass() returns true if this pass should run under the current filter. - // Supported names: "decode", "stencil", "degenerate", "inleaf", "framing", - // "center-hit", "legacy". "all" runs everything. + // Supported names: "decode", "stencil", "framing", "legacy". "all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; }; @@ -507,108 +457,9 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("stencil") - // ---- Hybrid floor: DegenerateStencil (18 taps all at (0,0,0)) ---- - double degenerateUs = 0.0; - uint64_t degenerateChecksum = 0; - if (wantPass("degenerate")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - degenerateUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - DegAccT degAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - uint64_t* bs = sums.data() + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - degAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; - uint64_t s = 0; - for (int k = 0; k < DegAccT::size(); ++k) - s += degAcc.mIndices[k][i]; - bs[batchStart + i] = s; - } - } - } - }); - }); - degenerateChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("degenerate") - - // ---- Hybrid floor (CSE-resistant): 18 distinct taps wrapped to center leaf ---- - double inLeafUs = 0.0; - uint64_t inLeafChecksum = 0; - if (wantPass("inleaf")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - inLeafUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t 
leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - InLeafAccT inLeafAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - uint64_t* bs = sums.data() + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - inLeafAcc.moveToInLeaf( - leafIndex + batchStart, voxelOffset + batchStart); - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; - uint64_t s = 0; - for (int k = 0; k < InLeafAccT::size(); ++k) - s += inLeafAcc.mIndices[k][i]; - bs[batchStart + i] = s; - } - } - } - }); - }); - inLeafChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("inleaf") - - // ---- Legacy cost decomposition variants ---- - // (a) "framing only" — Legacy loop structure, no accessor call (anti-DCE writes use li+k). - // Measures: decodeInverseMaps + Coord compute + 18-iteration inner loop + anti-DCE store. - // (b) "center-hit only" — Legacy loop + 18× mAcc.getValue(center) instead of tap offsets. - // Always hits the ReadAccessor's leaf cache → no tree walk. - // Measures: framing + cache-query + leaf-local lookup (mValueMask + mPrefixSum + popcount). - // (c) "full" — the original LegacyStencilAccessor path. - // Measures: framing + cache-query + leaf-local lookup + tree-walk-on-miss. - // - // Tree-walk cost per voxel ≈ full − center-hit. - // Cache + leaf-lookup per voxel ≈ center-hit − framing. - // Framing per voxel ≈ framing. - + // ---- Legacy framing floor: loop structure + decode, no accessor call ---- + // Anti-DCE writes derive from Coord components. Subtracted from the legacy + // pass to expose the 18-tap cost proper. double framingUs = 0.0; if (wantPass("framing")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); @@ -645,51 +496,6 @@ static void runPerf( }); } // end wantPass("framing") - double centerHitUs = 0.0; - if (wantPass("center-hit")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - centerHitUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - // Leaf-only cache (levels 1/2 would never be consulted for - // GetValue — see NanoVDB.h:5387 — and would only pay passive - // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. 
- nanovdb::ReadAccessor acc(grid->tree().root()); - uint64_t* bs0 = sums.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - (void)voxelOffset[i]; // keep decode non-dead - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - // 18 distinct positions ALL within this leaf's 8^3 footprint - // — guarantees leaf-cache hit on every call, but each coord - // is unique so the compiler can't CSE the lookups. - // k in [0..17]: local (k&7, (k>>3)&1, 0) sweeps an 8x2x1 slab. - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) { - const nanovdb::Coord c = cOrigin - + nanovdb::Coord(k & 7, (k >> 3) & 1, 0); - s += static_cast(acc.getValue(c)); - } - bs[i] = s; - } - } - }); - }); - - } // end wantPass("center-hit") - // ---- LegacyStencilAccessor ---- double legacyUs = 0.0; uint64_t legacyChecksum = 0; @@ -733,121 +539,19 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("legacy") - // ---- Legacy branchless: same as legacy but skip the leaf.getValue isOn branch ---- - // Replaces `leaf.getValue(offset)` (which branches on valueMask.isOn(offset)) - // with the unconditional formula: - // mOffset + prefix9(wordIdx) + popcount(maskWord & ((1<); - - legacyBranchlessUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - // Leaf-only cache (levels 1/2 would never be consulted for - // GetValue — see NanoVDB.h:5387 — and would only pay passive - // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. - nanovdb::ReadAccessor acc(grid->tree().root()); - uint64_t* bs0 = sums.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t* bs = bs0 + bID * BlockWidth; - - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); - - uint64_t s = 0; - auto addTap = [&](int di, int dj, int dk) { - const nanovdb::Coord c = center + nanovdb::Coord(di, dj, dk); - const LeafT* leaf = acc.probeLeaf(c); - if (!leaf) return; // tap outside narrow band (predictable branch - // for active-region voxels) - const uint32_t offset = (uint32_t(c[0] & 7) << 6) - | (uint32_t(c[1] & 7) << 3) - | uint32_t(c[2] & 7); - // LeafData::getValue (branchless by default; - // see NanoVDB.h + BatchAccessor.md §8k). - s += leaf->data()->getValue(offset); - }; - - // Unroll all 18 WENO5 taps via the compile-time tuple. 
- [&](std::index_sequence) { - (addTap( - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk - ), ...); - }(std::make_index_sequence{}); - - bs[i] = s; - } - } - }); - }); - - legacyBranchlessChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("legacy-branchless") - std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", nBlocks, nVoxels); std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n", decodeUs / 1e3, decodeUs * 1e3 / double(nVoxels)); + std::printf(" Framing (no accessor) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode]\n", + framingUs / 1e3, framingUs * 1e3 / double(nVoxels), + (framingUs - decodeUs) / 1e3); std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels), (stencilUs - decodeUs) / 1e3, stencilChecksum); - std::printf(" Degenerate (18×center): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", - degenerateUs / 1e3, degenerateUs * 1e3 / double(nVoxels), - (degenerateUs - decodeUs) / 1e3, degenerateChecksum); - std::printf(" InLeaf (18 distinct) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", - inLeafUs / 1e3, inLeafUs * 1e3 / double(nVoxels), - (inLeafUs - decodeUs) / 1e3, inLeafChecksum); std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), (legacyUs - decodeUs) / 1e3, legacyChecksum); - std::printf(" Legacy branchless : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", - legacyBranchlessUs / 1e3, legacyBranchlessUs * 1e3 / double(nVoxels), - (legacyBranchlessUs - decodeUs) / 1e3, legacyBranchlessChecksum); - - // Decomposition of LegacyStencilAccessor's ns/voxel: - // framing = no accessor call - // cache + leaf = centerHit − framing (per 18 taps) - // tree walk = legacy − centerHit (per 18 taps; amortises over ~25% miss rate) - const double framingNs = framingUs * 1e3 / double(nVoxels); - const double centerHitNs = centerHitUs * 1e3 / double(nVoxels); - const double legacyNs = legacyUs * 1e3 / double(nVoxels); - std::printf("\nLegacy cost decomposition (18 taps/voxel):\n"); - std::printf(" framing only : %7.1f ms (%5.1f ns/voxel)\n", - framingUs / 1e3, framingNs); - std::printf(" + center-hit × 18 : %7.1f ms (%5.1f ns/voxel) [cache+leaf = %5.2f ns/vox = %4.2f ns/tap]\n", - centerHitUs / 1e3, centerHitNs, - centerHitNs - framingNs, (centerHitNs - framingNs) / 18.0); - std::printf(" + stencil × 18 (full): %7.1f ms (%5.1f ns/voxel) [tree walk = %5.2f ns/vox = %4.2f ns/tap]\n", - legacyUs / 1e3, legacyNs, - legacyNs - centerHitNs, (legacyNs - centerHitNs) / 18.0); if (stencilChecksum != legacyChecksum) std::cerr << " WARNING: checksums differ — accessor results disagree!\n"; @@ -870,8 +574,7 @@ static void printUsage(const char* argv0) << " --grid= Select grid by name (default: first FloatGrid)\n" << " --pass= Run one perf pass:\n" << " all (default), verify, decode, stencil,\n" - << " degenerate, inleaf, framing, center-hit,\n" - << " legacy, legacy-branchless\n" + << " framing, legacy\n" << " --threads= Limit TBB parallelism (0 = TBB default)\n" << " --skip-validation Skip the sidecar ordering sanity check\n"; } diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp 
b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index 2590386a4a..37bcbc642b 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -71,55 +71,6 @@ using CPUVBM = nanovdb::tools::VoxelBlockManager; using SAccT = nanovdb::StencilAccessor; using LegacyAccT = nanovdb::LegacyStencilAccessor; -// Decomposition-only stencil: 18 taps all at (0,0,0). Measures the hybrid -// StencilAccessor's floor cost when no tap crosses a leaf boundary and every -// lookup hits the center leaf. Subtracting this from the Weno5 run isolates -// the cross-leaf overhead — BUT the 18 identical compile-time taps give the -// compiler a large CSE opportunity, biasing the number downward. -struct DegenerateStencil { - using Taps = std::tuple< - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0>, - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<0,0,0> - >; - // Empty Hull: prefetchHull becomes a no-op; center leaf is always cached - // by BatchAccessor's constructor / advance(). - using Hull = std::tuple<>; -}; -using DegAccT = nanovdb::StencilAccessor; - -// CSE-resistant in-leaf stencil: 18 distinct compile-time taps spanning the -// leaf's 8^3 footprint (all axes, 6 tap offsets in [0..6] per axis). Used -// via StencilAccessor::moveToInLeaf, which applies (voxel_local + tap) mod 8 -// to the center voxel — guaranteeing every tap accesses the center leaf -// while touching distinct mValueMask words across taps and across voxels. -// This isolates the hybrid's single-leaf floor without the CSE bias that -// DegenerateStencil suffers from. -struct InLeafStencil { - using Taps = std::tuple< - // x spans 0..6 (hits mValueMask words 0..6 depending on voxel's local x) - nanovdb::StencilPoint<0,0,0>, nanovdb::StencilPoint<1,0,0>, - nanovdb::StencilPoint<2,0,0>, nanovdb::StencilPoint<3,0,0>, - nanovdb::StencilPoint<4,0,0>, nanovdb::StencilPoint<5,0,0>, - // y spans 1..6 (different destY positions within a word) - nanovdb::StencilPoint<0,1,0>, nanovdb::StencilPoint<0,2,0>, - nanovdb::StencilPoint<0,3,0>, nanovdb::StencilPoint<0,4,0>, - nanovdb::StencilPoint<0,5,0>, nanovdb::StencilPoint<0,6,0>, - // z spans 1..6 - nanovdb::StencilPoint<0,0,1>, nanovdb::StencilPoint<0,0,2>, - nanovdb::StencilPoint<0,0,3>, nanovdb::StencilPoint<0,0,4>, - nanovdb::StencilPoint<0,0,5>, nanovdb::StencilPoint<0,0,6> - >; - using Hull = std::tuple<>; // moveToInLeaf skips prefetchHull entirely -}; -using InLeafAccT = nanovdb::StencilAccessor; - // ============================================================ // Test domain generation (mirrors vbm_host_cuda.cpp) // ============================================================ @@ -311,8 +262,7 @@ static void runPerf( const std::string& passFilter = "all") { // wantPass() returns true if this pass should run under the current filter. - // Supported names: "decode", "stencil", "degenerate", "inleaf", "framing", - // "center-hit", "legacy". "all" runs everything. + // Supported names: "decode", "stencil", "framing", "legacy". 
"all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; }; @@ -413,108 +363,9 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("stencil") - // ---- Hybrid floor: DegenerateStencil (18 taps all at (0,0,0)) ---- - double degenerateUs = 0.0; - uint64_t degenerateChecksum = 0; - if (wantPass("degenerate")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - degenerateUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - DegAccT degAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - uint64_t* bs = sums.data() + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - degAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; - uint64_t s = 0; - for (int k = 0; k < DegAccT::size(); ++k) - s += degAcc.mIndices[k][i]; - bs[batchStart + i] = s; - } - } - } - }); - }); - degenerateChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("degenerate") - - // ---- Hybrid floor (CSE-resistant): 18 distinct taps wrapped to center leaf ---- - double inLeafUs = 0.0; - uint64_t inLeafChecksum = 0; - if (wantPass("inleaf")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - inLeafUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - InLeafAccT inLeafAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - uint64_t* bs = sums.data() + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - inLeafAcc.moveToInLeaf( - leafIndex + batchStart, voxelOffset + batchStart); - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; - uint64_t s = 0; - for (int k = 0; k < InLeafAccT::size(); ++k) - s += inLeafAcc.mIndices[k][i]; - bs[batchStart + i] = s; - } - } - } - }); - }); - inLeafChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("inleaf") - - // ---- Legacy cost decomposition variants ---- - // (a) "framing only" — Legacy loop structure, no accessor call (anti-DCE writes use li+k). - // Measures: decodeInverseMaps + Coord compute + 18-iteration inner loop + anti-DCE store. 
- // (b) "center-hit only" — Legacy loop + 18× mAcc.getValue(center) instead of tap offsets. - // Always hits the ReadAccessor's leaf cache → no tree walk. - // Measures: framing + cache-query + leaf-local lookup (mValueMask + mPrefixSum + popcount). - // (c) "full" — the original LegacyStencilAccessor path. - // Measures: framing + cache-query + leaf-local lookup + tree-walk-on-miss. - // - // Tree-walk cost per voxel ≈ full − center-hit. - // Cache + leaf-lookup per voxel ≈ center-hit − framing. - // Framing per voxel ≈ framing. - + // ---- Legacy framing floor: loop structure + decode, no accessor call ---- + // Anti-DCE writes derive from Coord components. Subtracted from the legacy + // pass to expose the 18-tap cost proper. double framingUs = 0.0; if (wantPass("framing")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); @@ -551,51 +402,6 @@ static void runPerf( }); } // end wantPass("framing") - double centerHitUs = 0.0; - if (wantPass("center-hit")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - centerHitUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - // Leaf-only cache (levels 1/2 would never be consulted for - // GetValue — see NanoVDB.h:5387 — and would only pay passive - // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. - nanovdb::ReadAccessor acc(grid->tree().root()); - uint64_t* bs0 = sums.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - (void)voxelOffset[i]; // keep decode non-dead - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - // 18 distinct positions ALL within this leaf's 8^3 footprint - // — guarantees leaf-cache hit on every call, but each coord - // is unique so the compiler can't CSE the lookups. - // k in [0..17]: local (k&7, (k>>3)&1, 0) sweeps an 8x2x1 slab. - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) { - const nanovdb::Coord c = cOrigin - + nanovdb::Coord(k & 7, (k >> 3) & 1, 0); - s += static_cast(acc.getValue(c)); - } - bs[i] = s; - } - } - }); - }); - - } // end wantPass("center-hit") - // ---- LegacyStencilAccessor ---- double legacyUs = 0.0; uint64_t legacyChecksum = 0; @@ -639,121 +445,19 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("legacy") - // ---- Legacy branchless: same as legacy but skip the leaf.getValue isOn branch ---- - // Replaces `leaf.getValue(offset)` (which branches on valueMask.isOn(offset)) - // with the unconditional formula: - // mOffset + prefix9(wordIdx) + popcount(maskWord & ((1<); - - legacyBranchlessUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - // Leaf-only cache (levels 1/2 would never be consulted for - // GetValue — see NanoVDB.h:5387 — and would only pay passive - // bookkeeping on miss). See LegacyStencilAccessor.h for rationale. 
- nanovdb::ReadAccessor acc(grid->tree().root()); - uint64_t* bs0 = sums.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t* bs = bs0 + bID * BlockWidth; - - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); - - uint64_t s = 0; - auto addTap = [&](int di, int dj, int dk) { - const nanovdb::Coord c = center + nanovdb::Coord(di, dj, dk); - const LeafT* leaf = acc.probeLeaf(c); - if (!leaf) return; // tap outside narrow band (predictable branch - // for active-region voxels) - const uint32_t offset = (uint32_t(c[0] & 7) << 6) - | (uint32_t(c[1] & 7) << 3) - | uint32_t(c[2] & 7); - // LeafData::getValue (branchless by default; - // see NanoVDB.h + BatchAccessor.md §8k). - s += leaf->data()->getValue(offset); - }; - - // Unroll all 18 WENO5 taps via the compile-time tuple. - [&](std::index_sequence) { - (addTap( - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk - ), ...); - }(std::make_index_sequence{}); - - bs[i] = s; - } - } - }); - }); - - legacyBranchlessChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("legacy-branchless") - std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", nBlocks, nVoxels); std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n", decodeUs / 1e3, decodeUs * 1e3 / double(nVoxels)); + std::printf(" Framing (no accessor) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode]\n", + framingUs / 1e3, framingUs * 1e3 / double(nVoxels), + (framingUs - decodeUs) / 1e3); std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels), (stencilUs - decodeUs) / 1e3, stencilChecksum); - std::printf(" Degenerate (18×center): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", - degenerateUs / 1e3, degenerateUs * 1e3 / double(nVoxels), - (degenerateUs - decodeUs) / 1e3, degenerateChecksum); - std::printf(" InLeaf (18 distinct) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", - inLeafUs / 1e3, inLeafUs * 1e3 / double(nVoxels), - (inLeafUs - decodeUs) / 1e3, inLeafChecksum); std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), (legacyUs - decodeUs) / 1e3, legacyChecksum); - std::printf(" Legacy branchless : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", - legacyBranchlessUs / 1e3, legacyBranchlessUs * 1e3 / double(nVoxels), - (legacyBranchlessUs - decodeUs) / 1e3, legacyBranchlessChecksum); - - // Decomposition of LegacyStencilAccessor's ns/voxel: - // framing = no accessor call - // cache + leaf = centerHit − framing (per 18 taps) - // tree walk = legacy − centerHit (per 18 taps; amortises over ~25% miss rate) - const double framingNs = framingUs * 1e3 / double(nVoxels); - const double centerHitNs = centerHitUs * 1e3 / double(nVoxels); - const double legacyNs = legacyUs * 1e3 / 
double(nVoxels); - std::printf("\nLegacy cost decomposition (18 taps/voxel):\n"); - std::printf(" framing only : %7.1f ms (%5.1f ns/voxel)\n", - framingUs / 1e3, framingNs); - std::printf(" + center-hit × 18 : %7.1f ms (%5.1f ns/voxel) [cache+leaf = %5.2f ns/vox = %4.2f ns/tap]\n", - centerHitUs / 1e3, centerHitNs, - centerHitNs - framingNs, (centerHitNs - framingNs) / 18.0); - std::printf(" + stencil × 18 (full): %7.1f ms (%5.1f ns/voxel) [tree walk = %5.2f ns/vox = %4.2f ns/tap]\n", - legacyUs / 1e3, legacyNs, - legacyNs - centerHitNs, (legacyNs - centerHitNs) / 18.0); if (stencilChecksum != legacyChecksum) std::cerr << " WARNING: checksums differ — accessor results disagree!\n"; From f3b53edd2f2bc7147464fa6d6359bacd03eea5a4 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 20 Apr 2026 11:43:28 -0500 Subject: [PATCH 41/60] =?UTF-8?q?ex=5F*stencil=5Fcpu:=20add=20legacy-trans?= =?UTF-8?q?posed=20pass;=20BatchAccessor:=20document=20=C2=A78l?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a tap-outer, voxel-inner variant of the Legacy path (same leaf-only ReadAccessor, same probeLeaf + getValue mechanics, just the nested loops swapped) as a new `legacy-transposed` benchmark pass in both examples. Checksums match the voxel-outer `legacy` pass byte- for-byte on both synthetic and narrow-band workloads. During the experiment we hit a GCC inlining pitfall: a runtime-args inner lambda `[&](int di, int dj, int dk)` invoked 18 times via parameter-pack fold did *not* get inlined (lambda body contains a 128-iteration loop; GCC's inline budget × 18 is exhausted). Result: 18 explicit call instructions to a 542-byte processTap function with 6-register prologue/epilogue per call, plus tap offsets becoming runtime register arguments (one spilled to stack) — accounting for ~13 % of the observed slowdown vs. Legacy. Fix is a templated lambda `[&]() [[gnu::always_inline]]` dispatched via `.template operator()<...>()`. Standalone processTap symbol vanishes; transposed body grows 4.4 → 9.8 KB, matching Legacy's 10.5 KB. Measured at ~32M active voxels on i9-285K (24 threads): - Narrowband taperLER: Legacy 2.2 ns/vox vs Transposed 2.1 ns/vox (marginal, within the ~10 % noise floor) - Synthetic 64M/50%: Legacy 2.4 ns/vox vs Transposed 2.8 ns/vox (+19 %, outside noise) Implementation verdict: LegacyStencilAccessor's voxel-outer moveTo stays the default. Tap-outer has no consistent perf advantage, and voxel-outer wins on cleanliness (self-contained accessor, no scratch arrays, no compiler-inlining fragility, 1:1 mapping to the stencil- operator mental model). `legacy-transposed` kept as a benchmark pass for future reference. BatchAccessor.md §8l captures the experiment, the inlining-pitfall lesson, the measurement matrix, and the implementation-quality rationale behind keeping voxel-outer as the production default. 
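In sketch form (illustrative only; DI/DJ/DK and the loop body stand in
for the real tap tuple and the probeLeaf/getValue inner loop):

    // Pitfall: runtime tap offsets; GCC outlines the lambda body and
    // emits 18 calls to one shared processTap instantiation.
    auto processTap = [&](int di, int dj, int dk) {
        for (int i = 0; i < 128; ++i) { /* probeLeaf + getValue */ }
    };

    // Fix: compile-time tap offsets + forced inlining; each of the 18
    // instantiations specialises and collapses into the caller.
    auto processTapCT = [&]<int DI, int DJ, int DK>()
        [[gnu::always_inline]]
    {
        for (int i = 0; i < 128; ++i) { /* probeLeaf + getValue */ }
    };
    processTapCT.template operator()<-3, 0, 0>();  // one call per tap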
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- .../narrowband_stencil_cpu.cpp | 85 ++++++++++++++++++- .../stencil_gather_cpu.cpp | 83 +++++++++++++++++- nanovdb/nanovdb/util/BatchAccessor.md | 75 ++++++++++++++++ 3 files changed, 238 insertions(+), 5 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index e714c6bd91..688d10ae62 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -356,7 +356,8 @@ static void runPerf( const std::string& passFilter = "all") { // wantPass() returns true if this pass should run under the current filter. - // Supported names: "decode", "stencil", "framing", "legacy". "all" runs everything. + // Supported names: "decode", "stencil", "framing", "legacy", + // "legacy-transposed". "all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; }; @@ -539,6 +540,79 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("legacy") + // ---- Legacy transposed: tap-outer, voxel-inner ---- + // Same semantics as `legacy`, reordered. For each of the 18 WENO5 taps, + // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue + // calls with the SAME compile-time tap offset but varying center voxels. + double legacyXposedUs = 0.0; + uint64_t legacyXposedChecksum = 0; + if (wantPass("legacy-transposed")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + + using Weno5Taps = nanovdb::Weno5Stencil::Taps; + static constexpr int SIZE = int(std::tuple_size_v); + + legacyXposedUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + alignas(64) nanovdb::Coord centers[BlockWidth]; + alignas(64) uint64_t s[BlockWidth]; + nanovdb::ReadAccessor acc(grid->tree().root()); + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + for (int i = 0; i < BlockWidth; ++i) { + s[i] = 0; + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + centers[i] = cOrigin + nanovdb::Coord( + (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); + } + + auto processTap = [&]() + [[gnu::always_inline]] + { + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const nanovdb::Coord c = centers[i] + + nanovdb::Coord(DI, DJ, DK); + const LeafT* leaf = acc.probeLeaf(c); + if (!leaf) continue; + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + s[i] += leaf->data()->getValue(offset); + } + }; + + [&](std::index_sequence) { + (processTap.template operator()< + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk>(), ...); + }(std::make_index_sequence{}); + + uint64_t* bs = bs0 + bID * BlockWidth; + for (int i = 0; i < BlockWidth; ++i) bs[i] = s[i]; + } + }); + }); + + legacyXposedChecksum = + 
std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("legacy-transposed") + std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", nBlocks, nVoxels); std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n", @@ -552,9 +626,14 @@ static void runPerf( std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), (legacyUs - decodeUs) / 1e3, legacyChecksum); + std::printf(" Legacy transposed : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + legacyXposedUs / 1e3, legacyXposedUs * 1e3 / double(nVoxels), + (legacyXposedUs - decodeUs) / 1e3, legacyXposedChecksum); if (stencilChecksum != legacyChecksum) - std::cerr << " WARNING: checksums differ — accessor results disagree!\n"; + std::cerr << " WARNING: stencil/legacy checksums differ — accessor results disagree!\n"; + if (legacyChecksum != legacyXposedChecksum) + std::cerr << " WARNING: legacy/legacy-transposed checksums differ — ordering bug!\n"; } // ============================================================ @@ -574,7 +653,7 @@ static void printUsage(const char* argv0) << " --grid= Select grid by name (default: first FloatGrid)\n" << " --pass= Run one perf pass:\n" << " all (default), verify, decode, stencil,\n" - << " framing, legacy\n" + << " framing, legacy, legacy-transposed\n" << " --threads= Limit TBB parallelism (0 = TBB default)\n" << " --skip-validation Skip the sidecar ordering sanity check\n"; } diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index 37bcbc642b..94ab7e498a 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -262,7 +262,8 @@ static void runPerf( const std::string& passFilter = "all") { // wantPass() returns true if this pass should run under the current filter. - // Supported names: "decode", "stencil", "framing", "legacy". "all" runs everything. + // Supported names: "decode", "stencil", "framing", "legacy", + // "legacy-transposed". "all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; }; @@ -445,6 +446,79 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("legacy") + // ---- Legacy transposed: tap-outer, voxel-inner ---- + // Same semantics as `legacy`, reordered. For each of the 18 WENO5 taps, + // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue + // calls with the SAME compile-time tap offset but varying center voxels. 
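+    // Note: accumulation stays integer (uint64 sums, XOR checksum), so this
+    // reordering must reproduce the voxel-outer `legacy` checksum exactly.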
+ double legacyXposedUs = 0.0; + uint64_t legacyXposedChecksum = 0; + if (wantPass("legacy-transposed")) { + std::fill(sums.begin(), sums.end(), uint64_t(0)); + + using Weno5Taps = nanovdb::Weno5Stencil::Taps; + static constexpr int SIZE = int(std::tuple_size_v); + + legacyXposedUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + alignas(64) nanovdb::Coord centers[BlockWidth]; + alignas(64) uint64_t s[BlockWidth]; + nanovdb::ReadAccessor acc(grid->tree().root()); + uint64_t* bs0 = sums.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + for (int i = 0; i < BlockWidth; ++i) { + s[i] = 0; + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[i]; + const uint32_t li = leafIndex[i]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + centers[i] = cOrigin + nanovdb::Coord( + (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); + } + + auto processTap = [&]() + [[gnu::always_inline]] + { + for (int i = 0; i < BlockWidth; ++i) { + if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; + const nanovdb::Coord c = centers[i] + + nanovdb::Coord(DI, DJ, DK); + const LeafT* leaf = acc.probeLeaf(c); + if (!leaf) continue; + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + s[i] += leaf->data()->getValue(offset); + } + }; + + [&](std::index_sequence) { + (processTap.template operator()< + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk>(), ...); + }(std::make_index_sequence{}); + + uint64_t* bs = bs0 + bID * BlockWidth; + for (int i = 0; i < BlockWidth; ++i) bs[i] = s[i]; + } + }); + }); + + legacyXposedChecksum = + std::accumulate(sums.begin(), sums.end(), uint64_t(0), + [](uint64_t a, uint64_t b) { return a ^ b; }); + } // end wantPass("legacy-transposed") + std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n", nBlocks, nVoxels); std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n", @@ -458,9 +532,14 @@ static void runPerf( std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), (legacyUs - decodeUs) / 1e3, legacyChecksum); + std::printf(" Legacy transposed : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + legacyXposedUs / 1e3, legacyXposedUs * 1e3 / double(nVoxels), + (legacyXposedUs - decodeUs) / 1e3, legacyXposedChecksum); if (stencilChecksum != legacyChecksum) - std::cerr << " WARNING: checksums differ — accessor results disagree!\n"; + std::cerr << " WARNING: stencil/legacy checksums differ — accessor results disagree!\n"; + if (legacyChecksum != legacyXposedChecksum) + std::cerr << " WARNING: legacy/legacy-transposed checksums differ — ordering bug!\n"; } // ============================================================ diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md index b7cb4a1408..281fa66251 100644 --- a/nanovdb/nanovdb/util/BatchAccessor.md +++ b/nanovdb/nanovdb/util/BatchAccessor.md @@ -1467,6 +1467,71 @@ follow-ons implied by this work but not pursued here: legacy compatibility shim (to be retired after a deprecation 
window) rather than a permanent toggle. +### 8l. Follow-up: tap-outer loop ordering in the Legacy path + +Tested whether flipping `legacy`'s loop nest to tap-outer, +voxel-inner helps on spatially-coherent workloads where many voxels in +a batch are likely to share the same `valueMask` word. Added a +`legacy-transposed` benchmark pass in both `ex_stencil_gather_cpu` and +`ex_narrowband_stencil_cpu`; checksums match `legacy` byte-for-byte on +both workloads. + +#### 8l.1 Inlining pitfall + +First attempt used a runtime-args inner lambda +`[&](int di, int dj, int dk) { for (int i = 0; i < 128; ++i) ... }` +invoked 18 times via a parameter-pack fold. GCC refused to inline the +18 instantiations — the lambda body contains a 128-iteration loop with +`probeLeaf` + `getValue` inside, which blew past the per-caller inline +budget × 18. Result: 18 explicit `call` instructions to a 542-byte +`processTap` function with a 6-register prologue/epilogue per call, +and tap offsets `(di, dj, dk)` as runtime register arguments (one +spilled to stack) — so the compiler also couldn't specialise the loop +body per tap. That alone accounted for ~10 ms (~13 %) of the observed +slowdown vs. Legacy. + +Fix: templated lambda +`[&]() [[gnu::always_inline]] { ... }` +dispatched via `.template operator()()` inside the fold. +The standalone `processTap` symbol disappears; transposed body grows +from 4.4 KB → 9.8 KB (matching Legacy's 10.5 KB), and only cold-path +tree-walk helpers remain as call targets. + +#### 8l.2 Results + +Measured at ~32M active voxel scale on i9-285K (24 threads, no HT): + +| Workload | Legacy (voxel-outer) | Transposed (tap-outer) | Δ | +|----------|---------------------:|-----------------------:|--:| +| Narrowband taperLER.vdb | 2.2 ns/vox | 2.1 ns/vox | −3 to −6 % (within noise) | +| Synthetic 64M/50% | 2.4 ns/vox | 2.8 ns/vox | +19 % | + +The narrowband tap-outer edge is marginal and within the ~10 % +run-to-run noise floor observed on this host. Synthetic's tap-outer +slowdown is clearly outside noise. Not a consistent win. + +#### 8l.3 Implementation verdict: voxel-outer stays the default + +`LegacyStencilAccessor`'s voxel-outer `moveTo(center)` kept as the +production default: + +- **Clean abstraction**: `moveTo(center)` + indexed tap access maps + 1:1 to the stencil operator's mental model. A tap-outer batched + form would need external accumulator state and a centers-array + input, with no natural class boundary. +- **No scratch arrays**: voxel-outer keeps the per-voxel accumulator + in a register and the 18-tap buffer inside the accessor; tap-outer + needs stack-local `centers[128]` and `s[128]`. +- **Compiler robustness**: voxel-outer's 18-call single source + location is reliably collapsed by GCC. Tap-outer relies on an + explicit `[[gnu::always_inline]]` workaround that, if lost during + future refactors, would silently regress performance by ~13 %. + +`legacy-transposed` retained as a benchmark pass for reference and as +a datapoint reinforcing why the hybrid `StencilAccessor` is structured +tap-outer (SIMD direction-computation inherently amortises across +lanes at the same tap). + --- ## 9. Relationship to Phase 1 Prototype @@ -1559,6 +1624,16 @@ follow-ons implied by this work but not pursued here: Scope: benchmark-only; the library default is unchanged (right default for `probeValue`/`probeLeaf`/mixed workloads). 
+- **Tap-outer loop ordering evaluation in the Legacy path (§8l)**:
+  added `legacy-transposed` benchmark pass (checksums match byte-for-byte)
+  and tested on both workloads at matched ~32M-voxel scale. Narrowband:
+  marginal tap-outer edge, within noise. Synthetic: tap-outer ~19 %
+  slower. Uncovered a GCC inlining pitfall for runtime-args inner lambdas
+  (fixed via templated lambda + `[[gnu::always_inline]]`).
+  **Verdict**: voxel-outer `LegacyStencilAccessor` remains the default —
+  cleaner abstraction, no scratch arrays, no compiler-inlining fragility,
+  and no consistent perf advantage to tap-outer.
+
 ### Remaining

 - **`[[gnu::always_inline]]` on `Simd.h` helpers** (§8f) vs

From cc073c16207fb8df4a57529d090c499516799502 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Mon, 20 Apr 2026 12:10:05 -0500
Subject: [PATCH 42/60] =?UTF-8?q?Weno5Stencil:=20add=20center=20tap=20(19?=
 =?UTF-8?q?=20total);=20BatchAccessor:=20document=20=C2=A711=20WENO=20pipe?=
 =?UTF-8?q?line=20target?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our Weno5Stencil in StencilAccessor.h was missing the center tap
<0,0,0>, defining only the 18 off-center ±1,±2,±3 per-axis taps. The
canonical WenoStencil in nanovdb/math/Stencils.h has 19 taps with
WenoPt<0,0,0>::idx = 0, followed by x-axis (1..6), y-axis (7..12),
z-axis (13..18). Fixed our ordering to match exactly, so any caller
using the canonical WenoPt index convention gets the same slot layout
as our Weno5Stencil::Taps.

The Hull is unchanged: the center never crosses a leaf, so it
contributes nothing to the face-neighbour prefetch set (§4b
monotonicity argument).

Code impact is data-driven (SIZE = tuple_size_v): the
StencilAccessor/LegacyStencilAccessor mIndices storage grows from
[18][W] to [19][W], the parameter-pack fold in fillTaps now expands
to 19 calls, and the tap-outer `legacy-transposed` pass iterates 19
inner loops. No hard-coded 18s in the benchmarks.

Cross-path checksums still match byte-for-byte (hybrid vs legacy vs
transposed), now with the center voxel's index included:
- synthetic 64M/50%: 0x0000000001e1b860 (was 0x0000000000b2ce4fc)
- narrowband taperLER: 0x000000001a38de2b (was 0x000000001f8ad71a)

Rough 19/18 ratio shows up in the timings: narrowband slowed ~2-5%
across all three implementations (within the earlier noise floor);
synthetic transposed ticked up more (~13%) but stays within the
already-observed voxel-outer-vs-tap-outer spread.

---

Also documented the end-to-end CPU WENO5 target pipeline in
BatchAccessor.md §11. This captures the three-phase structure the
experimentation in §5-§8 has been building toward:
(1) Decode inverse maps (shipped)
(2) Per-batch sidecar value assembly — the next phase:
    float mValues[Ntaps][W] via sidecar lookup, per W=4/8 batch
    (per-batch scope to keep mValues L1-resident, ~9.5 KB block-wide
    would be too big)
(3) Full SIMD WENO over Simd

Documented the sign-extrapolation convention for out-of-band taps
(magnitude = |sidecar[0]|, sign inherited from the next-inner tap on
the same axis), its loop-order implications (tap-outer / axis-major /
ascending-|Δ|), and two open questions (distance-1 sign source,
cascade transitivity).

Flagged that the §8l voxel-outer-vs-tap-outer measurements do not
settle the Phase-2 loop order question: they were taken at
BlockWidth=128 inner-loop size over uint64 indices; the real Phase-2
inner loop will be W=4 or W=8 over floats.
Output-layout-match and sign-extrap dependency both push toward tap-outer at that scale, but a rerun at the real batch width is required before locking it in. Plus small 18→19 comment sweeps in the live source; historical measurement narratives in §8c-§8k left alone (they describe what was measured at 18 taps at the time and don't update automatically). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- .../narrowband_stencil_cpu.cpp | 4 +- .../stencil_gather_cpu.cpp | 4 +- nanovdb/nanovdb/util/BatchAccessor.h | 2 +- nanovdb/nanovdb/util/BatchAccessor.md | 157 +++++++++++++++++- nanovdb/nanovdb/util/StencilAccessor.h | 19 ++- nanovdb/nanovdb/util/StencilAccessor.md | 13 +- 6 files changed, 176 insertions(+), 23 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index 688d10ae62..0b661c5caa 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -460,7 +460,7 @@ static void runPerf( // ---- Legacy framing floor: loop structure + decode, no accessor call ---- // Anti-DCE writes derive from Coord components. Subtracted from the legacy - // pass to expose the 18-tap cost proper. + // pass to expose the 19-tap cost proper. double framingUs = 0.0; if (wantPass("framing")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); @@ -541,7 +541,7 @@ static void runPerf( } // end wantPass("legacy") // ---- Legacy transposed: tap-outer, voxel-inner ---- - // Same semantics as `legacy`, reordered. For each of the 18 WENO5 taps, + // Same semantics as `legacy`, reordered. For each of the 19 WENO5 taps, // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue // calls with the SAME compile-time tap offset but varying center voxels. double legacyXposedUs = 0.0; diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index 94ab7e498a..db3bb4c915 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -366,7 +366,7 @@ static void runPerf( // ---- Legacy framing floor: loop structure + decode, no accessor call ---- // Anti-DCE writes derive from Coord components. Subtracted from the legacy - // pass to expose the 18-tap cost proper. + // pass to expose the 19-tap cost proper. double framingUs = 0.0; if (wantPass("framing")) { std::fill(sums.begin(), sums.end(), uint64_t(0)); @@ -447,7 +447,7 @@ static void runPerf( } // end wantPass("legacy") // ---- Legacy transposed: tap-outer, voxel-inner ---- - // Same semantics as `legacy`, reordered. For each of the 18 WENO5 taps, + // Same semantics as `legacy`, reordered. For each of the 19 WENO5 taps, // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue // calls with the SAME compile-time tap offset but varying center voxels. double legacyXposedUs = 0.0; diff --git a/nanovdb/nanovdb/util/BatchAccessor.h b/nanovdb/nanovdb/util/BatchAccessor.h index 8a074c26cd..67d13a86ad 100644 --- a/nanovdb/nanovdb/util/BatchAccessor.h +++ b/nanovdb/nanovdb/util/BatchAccessor.h @@ -419,7 +419,7 @@ class BatchAccessor // cachedGetValueInLeaf -- benchmarking variant that forces all // taps to stay in the center leaf via mod-8 wrap. 
 //
-// Purpose: measure the hybrid pipeline's floor cost when all 18 taps
+// Purpose: measure the hybrid pipeline's floor cost when all taps
 // access the SAME leaf, with distinct per-tap / per-lane positions (so
 // the compiler can't CSE across taps, and we still exercise different
 // mValueMask words and prefix-sum slots). The result is semantically
diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md
index 281fa66251..d90cc6dc5f 100644
--- a/nanovdb/nanovdb/util/BatchAccessor.md
+++ b/nanovdb/nanovdb/util/BatchAccessor.md
@@ -1480,15 +1480,17 @@

 First attempt used a runtime-args inner lambda
 `[&](int di, int dj, int dk) { for (int i = 0; i < 128; ++i) ... }`
-invoked 18 times via a parameter-pack fold. GCC refused to inline the
-18 instantiations — the lambda body contains a 128-iteration loop with
-`probeLeaf` + `getValue` inside, which blew past the per-caller inline
-budget × 18. Result: 18 explicit `call` instructions to a 542-byte
-`processTap` function with a 6-register prologue/epilogue per call,
-and tap offsets `(di, dj, dk)` as runtime register arguments (one
-spilled to stack) — so the compiler also couldn't specialise the loop
-body per tap. That alone accounted for ~10 ms (~13 %) of the observed
-slowdown vs. Legacy.
+invoked N_taps times via a parameter-pack fold (18 at the time of the
+experiment — pre-center-tap — which is when these numbers were
+collected; the same issue and fix apply at 19). GCC refused to
+inline the instantiations — the lambda body contains a 128-iteration
+loop with `probeLeaf` + `getValue` inside, which blew past the
+per-caller inline budget × N_taps. Result: explicit `call`
+instructions to a 542-byte `processTap` function with a 6-register
+prologue/epilogue per call, and tap offsets `(di, dj, dk)` as runtime
+register arguments (one spilled to stack) — so the compiler also
+couldn't specialise the loop body per tap. That alone accounted for
+~10 ms (~13 %) of the observed slowdown vs. Legacy.

 Fix: templated lambda
 `[&]() [[gnu::always_inline]] { ... }`
 dispatched via `.template operator()()` inside the fold.
 The standalone `processTap` symbol disappears; transposed body grows
@@ -1661,3 +1663,140 @@ lanes at the same tap).

 - **C++20 structural `Coord`:** unify template and runtime interfaces
   via `cachedGetValue(result, vo, leafMask)`.
+
+---
+
+## 11. Target pipeline: per-block CPU WENO5 with sidecar values
+
+The work documented in §5–§8 — VBM decode, `BatchAccessor`,
+`StencilAccessor`, branchless `LeafData::getValue`, the voxel-outer vs
+tap-outer evaluation (§8l), the 19-tap `Weno5Stencil` alignment with
+canonical `WenoPt<>` ordering — is all in service of a single target
+end-to-end pipeline. For each VBM block the CPU WENO5 pass runs three
+phases:
+
+### 11.1 Phase structure
+
+**(1) Decode inverse maps** — produce `leafIndex[128]` and
+`voxelOffset[128]` from the block's `firstLeafID`, `jumpMap`, and
+`firstOffset`. Already shipped; see §2.
+
+**(2) Per-batch sidecar value assembly** — for each W-wide batch within
+the block (W = SIMD float lane width, typically 4 or 8), produce a
+dense 2D array `float mValues[Ntaps][W]` that packs every tap's float
+value for every active lane in the batch. This is where
+`StencilAccessor` (hybrid SIMD cache + scalar tail) or
+`LegacyStencilAccessor` (scalar per-voxel) plugs in — but the *output*
+shape changes from the current `uint64_t mIndices[Ntaps][W]` to a
+float array obtained via sidecar lookup (plus sign-extrapolation for
+off-band taps; see §11.2).
Per-batch scope is deliberate: `mValues` +stays resident in registers / L1 for the duration of the batch's WENO +arithmetic; a block-wide buffer would be 19 × 128 × 4 B ≈ 9.5 KB and +would spill L1 prematurely. + +**(3) Full SIMD WENO** — consume `mValues[tap][lane]` as +`Simd` loads (one SIMD register per tap) and evaluate the +WENO5 reconstruction via the generic-T Simd backend. The existing +Phase-2 GPU draft and the Simd.h infrastructure provide the +arithmetic; this phase is essentially `nanovdb::math::WENO5<>` applied +across W voxels simultaneously. + +### 11.2 Sidecar value assembly semantics + +For a tap at position *p = center + Δ*, the sidecar lookup is: + +``` +idx = leafPtr->getValue(localOffset(p)) // uint64_t, branchless +if (idx != 0) { + mValues[tap][lane] = sidecar[idx] +} else { + // out-of-band: voxel p is outside the narrow band + mValues[tap][lane] = sign_of_next_inner_tap * |sidecar[0]| +} +``` + +The "next-inner tap" is the tap one step closer to the center along +the *same* axis. This preserves a single-signed distance-field +interpretation across the band boundary: out-of-band voxels are +treated as "still outside on the same side as the near side," with +magnitude set to the background `|sidecar[0]|`. + +| Outer tap | Sign donor (next-inner along same axis) | +|-----------|-----------------------------------------| +| `<+2, 0, 0>`, `<0,+2, 0>`, `<0, 0,+2>` | `<+1, 0, 0>`, `<0,+1, 0>`, `<0, 0,+1>` | +| `<+3, 0, 0>`, `<0,+3, 0>`, `<0, 0,+3>` | `<+2, 0, 0>`, `<0,+2, 0>`, `<0, 0,+2>` | +| `<-2, 0, 0>`, `<0,-2, 0>`, `<0, 0,-2>` | `<-1, 0, 0>`, `<0,-1, 0>`, `<0, 0,-1>` | +| `<-3, 0, 0>`, `<0,-3, 0>`, `<0, 0,-3>` | `<-2, 0, 0>`, `<0,-2, 0>`, `<0, 0,-2>` | +| `<±1,0,0>`, `<0,±1,0>`, `<0,0,±1>` | *see §11.4 below* | + +### 11.3 Loop-order implications + +Two forces align to favor a tap-outer, voxel-inner assembly loop in +Phase 2: + +**(a) Output shape matches consumer.** `mValues[tap][lane]` is the +natural layout for `Simd::load(mValues[k], element_aligned)`. +A tap-outer assembly fills this directly. A voxel-outer assembly +would need either a transpose at the end or strided SIMD loads in +Phase 3. + +**(b) Sign-extrapolation dependency is tap-local.** If taps are +filled in axis-major / ascending-|Δ| order (e.g. for the x-axis: +first `<+1,0,0>`, then `<+2,0,0>`, then `<+3,0,0>`; similarly for +`−1,−2,−3` and for the y and z axes), the inner tap's float value is +already resident when the outer tap's sign-extrap check fires. +Voxel-outer also works but repeats the sign check per voxel rather +than once per (axis, |Δ|) pair. + +The §8l measurements (voxel-outer modestly beats tap-outer at +BlockWidth=128 inner-loop size over uint64 indices) do *not* settle +the Phase-2 loop-order question: at W=4 or W=8 inner-loop size, the +compiler-amortisation advantage of voxel-outer shrinks drastically +(only 4–8 voxels per unroll), while the output-layout-match benefit of +tap-outer becomes dominant. Re-running the ordering comparison at +the real pipeline's batch width is a required step before the +implementation choice is locked in. + +### 11.4 Open questions (to resolve before implementation) + +**(a) Sign source for distance-1 taps.** `<±1,0,0>`, `<0,±1,0>`, +`<0,0,±1>` have no inner tap along their axis except the center +`<0,0,0>`. Two possible rules: + +- *Uniform rule:* distance-1 inherits sign from the center tap's + float value. Always safe; one extra sign-check per distance-1 + tap. 
+- *Invariant-based rule:* distance-1 neighbours of any active voxel + are guaranteed in-band, so the sign-extrap branch never fires for + |Δ|=1 taps. Requires confirmation against how narrow-band layers + are generated upstream (openvdb level-set builders). + +The uniform rule is the default unless the invariant can be +confirmed and codified. + +**(b) Cascade behavior.** If the inner tap's value is *itself* the +result of a prior sign-extrapolation, using its sign directly is +correct by transitivity: when taps are processed in ascending-|Δ| +order along each axis, the inner tap's resolved float already carries +the correct sign (real or extrapolated), so the rule is +self-consistent without special-casing. Worth capturing here +because it's the quiet invariant that keeps the algorithm simple. + +### 11.5 Deliverables (not yet shipped) + +Implementation items that follow directly from §11.1–§11.4: + +- **Sidecar-aware `moveTo` variant** on `StencilAccessor` (and a + parallel form on `LegacyStencilAccessor`): same straddling + SIMD + cache structure as today, but writes `float mValues[SIZE][W]` via + sidecar lookup instead of `uint64_t mIndices[SIZE][W]`. +- **Sign-extrapolation pass** — either fused into the scalar tail (per + §11.3b), or as a post-pass that walks taps in axis-major, + ascending-|Δ| order over the filled `mValues`. +- **Phase-3 WENO kernel** — `nanovdb::math::WENO5>` + driven by the 19-slot `mValues` array, following the existing + `WenoStencil::WENO5` arithmetic but with all reads from the + pre-assembled batch buffer. +- **Batch-width ordering benchmark** — rerun the legacy/transposed + comparison at W=4 and W=8 over floats (not uint64 indices) to lock + in the Phase-2 loop order. diff --git a/nanovdb/nanovdb/util/StencilAccessor.h b/nanovdb/nanovdb/util/StencilAccessor.h index 2689457ebe..7429a11f64 100644 --- a/nanovdb/nanovdb/util/StencilAccessor.h +++ b/nanovdb/nanovdb/util/StencilAccessor.h @@ -83,14 +83,22 @@ constexpr int findIndex(std::index_sequence) } // namespace detail // ============================================================================= -// Weno5Stencil — 18-tap axis-aligned WENO5 stencil, radius 3 +// Weno5Stencil — 19-tap axis-aligned WENO5 stencil, radius 3 // ============================================================================= /// Concrete StencilT for the WENO5 3D stencil. -/// Taps: 18 axis-aligned offsets in {±1,±2,±3} × {x,y,z}. -/// Hull: 6 extremal offsets that cover all 18 tap crossing directions. +/// Taps: 19 axis-aligned offsets — the center plus {±1,±2,±3} along each of x,y,z. +/// Hull: 6 extremal offsets that cover all 18 non-center tap crossing directions. +/// +/// Tap ordering matches WenoPt::idx in nanovdb/math/Stencils.h: +/// idx 0 : <0,0,0> +/// idx 1.. 6 : x-axis <-3,0,0> <-2,0,0> <-1,0,0> <+1,0,0> <+2,0,0> <+3,0,0> +/// idx 7..12 : y-axis <0,-3,0> <0,-2,0> <0,-1,0> <0,+1,0> <0,+2,0> <0,+3,0> +/// idx 13..18 : z-axis <0,0,-3> <0,0,-2> <0,0,-1> <0,0,+1> <0,0,+2> <0,0,+3> struct Weno5Stencil { using Taps = std::tuple< + // center + StencilPoint< 0, 0, 0>, // x-axis StencilPoint<-3, 0, 0>, StencilPoint<-2, 0, 0>, StencilPoint<-1, 0, 0>, StencilPoint<+1, 0, 0>, StencilPoint<+2, 0, 0>, StencilPoint<+3, 0, 0>, @@ -102,7 +110,8 @@ struct Weno5Stencil { StencilPoint< 0, 0,+1>, StencilPoint< 0, 0,+2>, StencilPoint< 0, 0,+3> >; // Hull = 6 extremal taps that collectively probe all reachable face-neighbor - // directions for any combination of voxel position and WENO5 tap. 
+ // directions for any combination of voxel position and non-center WENO5 tap. + // The center tap never crosses a leaf, so it's absent here by design. // See StencilAccessor.md §4b for the monotonicity argument. using Hull = std::tuple< StencilPoint<-3, 0, 0>, StencilPoint<+3, 0, 0>, @@ -183,7 +192,7 @@ class StencilAccessor // // Active-lane semantics: a lane i is "active" iff // leafIndex[i] != UnusedLeafIndex - // Active lanes receive their 18 tap indices in mIndices[k][i]. + // Active lanes receive their 19 tap indices in mIndices[k][i]. // Inactive lanes are zeroed (NanoVDB background index). // // Caller pattern: diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md index 059ad3ec2b..c228644314 100644 --- a/nanovdb/nanovdb/util/StencilAccessor.md +++ b/nanovdb/nanovdb/util/StencilAccessor.md @@ -127,18 +127,23 @@ that would need to reason about leaf size vs. tap radius. ### 4c. Sketch of `StencilT` concept ```cpp -// WENO5 3D stencil: 18 axis-aligned taps, radius 3, hull = 6 extremal taps +// WENO5 3D stencil: 19 taps (center + 6 per axis at ±1,±2,±3), radius 3, +// hull = 6 extremal taps. Tap ordering matches WenoPt::idx in +// nanovdb/math/Stencils.h, so slot k here corresponds to the same physical +// tap as any code using the canonical WenoStencil index convention. struct Weno5Stencil { - static constexpr int SIZE = 18; + static constexpr int SIZE = 19; // ordered tap list: output slot i ↔ taps[i] static constexpr nanovdb::Coord taps[SIZE] = { + { 0, 0, 0}, {-3,0,0}, {-2,0,0}, {-1,0,0}, {1,0,0}, {2,0,0}, {3,0,0}, {0,-3,0}, {0,-2,0}, {0,-1,0}, {0,1,0}, {0,2,0}, {0,3,0}, {0,0,-3}, {0,0,-2}, {0,0,-1}, {0,0,1}, {0,0,2}, {0,0,3}, }; - // prefetch hull: 6 extremal taps cover all 18 + // prefetch hull: 6 extremal taps cover all 18 non-center taps + // (center never crosses a leaf, so it's excluded from the hull). static constexpr int HULL_SIZE = 6; static constexpr nanovdb::Coord hull[HULL_SIZE] = { {-3,0,0}, {3,0,0}, @@ -224,7 +229,7 @@ than stale data. Active lanes are then written by the straddling loop via `where`-blend; in the straddling case the blend ensures majority-leaf results are not overwritten when minority-leaf lanes are processed. -**Stack footprint:** for WENO5, W=16: 18 × 16 × 8 bytes = **2.25 KB**. +**Stack footprint:** for WENO5, W=16: 19 × 16 × 8 bytes = **2.375 KB**. Acceptable for a stack-local object within a VBM block kernel; would need care if embedded in a larger persistent structure. From 5a92059686dc16f1a27875f5ad80a9e739e84316 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 20 Apr 2026 17:04:53 -0500 Subject: [PATCH 43/60] ex_*stencil_cpu: batch-by-batch loops in legacy/framing/transposed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructured the `legacy`, `framing`, and `legacy-transposed` passes in both examples so that after `decodeInverseMaps` the work runs as an outer `for batchStart in 0..BlockWidth step SIMDw` loop with an inner lane loop over SIMDw=16. The StencilAccessor pass was already batched (reference structure); this aligns every other pass with it. Notes: - LegacyStencilAccessor / ReadAccessor instances stay outside the batch loop (one per TBB task) — same amortisation as before. - `legacy-transposed`: `centers[]` and `s[]` scratch arrays shrunk from BlockWidth=128 to SIMDw=16; the templated `processTap` lambda's inner loop also runs over SIMDw lanes. 
This is Stage 1 of the Phase-2 pipeline preparation (BatchAccessor.md §11): Phase 2 will slot a sidecar-aware value assembly + SIMD WENO kernel into the same batch loop, so every path needs that shape in place first. Perf impact (24-thread, i9-285K): - StencilAccessor: unchanged (already batched). - Legacy: ~5 % slower on narrowband, noise-level on synthetic. - Legacy-transposed: ~6-7 % slower both workloads — the per-batch `processTap` lambda fires 8x instead of once per block. Checksums match byte-for-byte across all three passes on both workloads. Co-Authored-By: Claude Sonnet 4.6 --- .../narrowband_stencil_cpu.cpp | 122 ++++++++++-------- .../stencil_gather_cpu.cpp | 122 ++++++++++-------- 2 files changed, 132 insertions(+), 112 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index 0b661c5caa..d812bec90d 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -479,18 +479,21 @@ static void runPerf( leafIndex, voxelOffset); uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); - // 18 trivial "taps" — no accessor call; anti-DCE via Coord components. - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) - s += static_cast(center.x() + center.y() + center.z() + k); - bs[i] = s; + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); + // 18 trivial "taps" — no accessor call; anti-DCE via Coord components. 
+ uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) + s += static_cast(center.x() + center.y() + center.z() + k); + bs[p] = s; + } } } }); @@ -520,16 +523,19 @@ static void runPerf( uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; - bs[i] = s; + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); + uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; + bs[p] = s; + } } } }); @@ -557,8 +563,8 @@ static void runPerf( [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; alignas(64) uint16_t voxelOffset[BlockWidth]; - alignas(64) nanovdb::Coord centers[BlockWidth]; - alignas(64) uint64_t s[BlockWidth]; + alignas(64) nanovdb::Coord centers[SIMDw]; + alignas(64) uint64_t s[SIMDw]; nanovdb::ReadAccessor acc(grid->tree().root()); uint64_t* bs0 = sums.data(); @@ -569,41 +575,45 @@ static void runPerf( firstOffset + bID * BlockWidth, leafIndex, voxelOffset); - for (int i = 0; i < BlockWidth; ++i) { - s[i] = 0; - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - centers[i] = cOrigin + nanovdb::Coord( - (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); - } + uint64_t* bs = bs0 + bID * BlockWidth; - auto processTap = [&]() - [[gnu::always_inline]] - { - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const nanovdb::Coord c = centers[i] - + nanovdb::Coord(DI, DJ, DK); - const LeafT* leaf = acc.probeLeaf(c); - if (!leaf) continue; - const uint32_t offset = (uint32_t(c[0] & 7) << 6) - | (uint32_t(c[1] & 7) << 3) - | uint32_t(c[2] & 7); - s[i] += leaf->data()->getValue(offset); + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + s[i] = 0; + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + centers[i] = cOrigin + nanovdb::Coord( + (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); } - }; - [&](std::index_sequence) { - (processTap.template operator()< - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk>(), ...); - }(std::make_index_sequence{}); - - uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) bs[i] = s[i]; + auto processTap = [&]() + [[gnu::always_inline]] + { + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; + const nanovdb::Coord c = centers[i] + + nanovdb::Coord(DI, DJ, DK); + const LeafT* leaf = acc.probeLeaf(c); 
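+                                // probeLeaf returns nullptr when the tap falls
+                                // outside the narrow band; leave the lane's sum as-is.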
+ if (!leaf) continue; + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + s[i] += leaf->data()->getValue(offset); + } + }; + + [&](std::index_sequence) { + (processTap.template operator()< + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk>(), ...); + }(std::make_index_sequence{}); + + for (int i = 0; i < SIMDw; ++i) bs[batchStart + i] = s[i]; + } } }); }); diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp index db3bb4c915..5adcb2de12 100644 --- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp @@ -385,18 +385,21 @@ static void runPerf( leafIndex, voxelOffset); uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); - // 18 trivial "taps" — no accessor call; anti-DCE via Coord components. - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) - s += static_cast(center.x() + center.y() + center.z() + k); - bs[i] = s; + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); + // 18 trivial "taps" — no accessor call; anti-DCE via Coord components. 
+ uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) + s += static_cast(center.x() + center.y() + center.z() + k); + bs[p] = s; + } } } }); @@ -426,16 +429,19 @@ static void runPerf( uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; - bs[i] = s; + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); + uint64_t s = 0; + for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; + bs[p] = s; + } } } }); @@ -463,8 +469,8 @@ static void runPerf( [&](const nanovdb::util::Range1D& range) { alignas(64) uint32_t leafIndex[BlockWidth]; alignas(64) uint16_t voxelOffset[BlockWidth]; - alignas(64) nanovdb::Coord centers[BlockWidth]; - alignas(64) uint64_t s[BlockWidth]; + alignas(64) nanovdb::Coord centers[SIMDw]; + alignas(64) uint64_t s[SIMDw]; nanovdb::ReadAccessor acc(grid->tree().root()); uint64_t* bs0 = sums.data(); @@ -475,41 +481,45 @@ static void runPerf( firstOffset + bID * BlockWidth, leafIndex, voxelOffset); - for (int i = 0; i < BlockWidth; ++i) { - s[i] = 0; - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[i]; - const uint32_t li = leafIndex[i]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - centers[i] = cOrigin + nanovdb::Coord( - (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); - } + uint64_t* bs = bs0 + bID * BlockWidth; - auto processTap = [&]() - [[gnu::always_inline]] - { - for (int i = 0; i < BlockWidth; ++i) { - if (leafIndex[i] == CPUVBM::UnusedLeafIndex) continue; - const nanovdb::Coord c = centers[i] - + nanovdb::Coord(DI, DJ, DK); - const LeafT* leaf = acc.probeLeaf(c); - if (!leaf) continue; - const uint32_t offset = (uint32_t(c[0] & 7) << 6) - | (uint32_t(c[1] & 7) << 3) - | uint32_t(c[2] & 7); - s[i] += leaf->data()->getValue(offset); + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + s[i] = 0; + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + centers[i] = cOrigin + nanovdb::Coord( + (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); } - }; - [&](std::index_sequence) { - (processTap.template operator()< - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk>(), ...); - }(std::make_index_sequence{}); - - uint64_t* bs = bs0 + bID * BlockWidth; - for (int i = 0; i < BlockWidth; ++i) bs[i] = s[i]; + auto processTap = [&]() + [[gnu::always_inline]] + { + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; + const nanovdb::Coord c = centers[i] + + nanovdb::Coord(DI, DJ, DK); + const LeafT* leaf = acc.probeLeaf(c); 
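+                                // probeLeaf returns nullptr when the tap falls
+                                // outside the narrow band; leave the lane's sum as-is.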
+ if (!leaf) continue; + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + s[i] += leaf->data()->getValue(offset); + } + }; + + [&](std::index_sequence) { + (processTap.template operator()< + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk>(), ...); + }(std::make_index_sequence{}); + + for (int i = 0; i < SIMDw; ++i) bs[batchStart + i] = s[i]; + } } }); }); From 110d852c55a6cd6f5a9b3ad93528de68ac0df020 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 20 Apr 2026 17:43:42 -0500 Subject: [PATCH 44/60] ex_narrowband_stencil_cpu: sidecar-legacy/stencil/transposed passes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 2 of the Phase-2 pipeline preparation (BatchAccessor.md §11). Three new benchmark passes that assemble per-batch value + activity matrices from the sidecar, plus a stand-in token op for anti-DCE: float values [SIZE][SIMDw] -- sidecar[idx] (idx==0 -> background) bool isActive[SIZE][SIMDw] -- (idx != 0) Token op: per active voxel, sum values[k][i] over taps with isActive[k][i]==true, write to a second sidecar at the voxel's VBM-sequential index (firstOffset + bID*BlockWidth + lane), which equals the center voxel's ValueOnIndex by VBM construction. Supporting changes: - convertToIndexGridWithSidecar: set sidecar[0] = floatGrid.background() after copyValues, so sidecar[idx] is unconditionally valid. - runPerf: takes `const std::vector& sidecar`; allocates a local outputSidecar (same shape as input) for the new passes. Pass variants: - sidecar-legacy: LegacyStencilAccessor (scalar moveTo per voxel) - sidecar-stencil: StencilAccessor (hybrid SIMD+scalar moveTo per batch, reads mIndices[k][i] directly) - sidecar-transposed: tap-outer ReadAccessor probeLeaf + getValue All three produce identical checksums (cross-validation): - checksum 0xcfbff7c8 on taperLER.vdb End-to-end timings (24 threads, i9-285K): sidecar-legacy: 130 ms (4.1 ns/voxel) sidecar-stencil: 97 ms (3.0 ns/voxel) -- best sidecar-transposed: 126 ms (4.0 ns/voxel) StencilAccessor's SIMD moveTo amortises both the direction-decode and the per-lane scalar work; its tap-outer contiguous mIndices[k][i] rows also let the sidecar gather and value-array fill vectorise cleanly. Co-Authored-By: Claude Sonnet 4.6 --- .../narrowband_stencil_cpu.cpp | 301 +++++++++++++++++- 1 file changed, 296 insertions(+), 5 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index d812bec90d..c060162b62 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -58,6 +58,7 @@ #include #include #include +#include // std::memcpy (sidecar-pass checksum) #include #include // std::unique_ptr #include @@ -146,6 +147,12 @@ convertToIndexGridWithSidecar(openvdb::FloatGrid& floatGrid) // valueCount() is only valid after getHandle with an index DstBuildT. p.sidecar.resize(builder.valueCount()); builder.template copyValues(p.sidecar.data()); + + // NanoVDB convention: index 0 of the sidecar holds the background value. + // copyValues does not write slot 0 (active voxel indices start at 1); + // set it explicitly so downstream code can treat sidecar[idx] as valid + // for both in-band (idx>0) and out-of-band (idx==0) taps without branching. 
+ if (!p.sidecar.empty()) p.sidecar[0] = floatGrid.background(); return p; } @@ -353,10 +360,12 @@ static void runPrototype( static void runPerf( const GridT* grid, const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle, + const std::vector& sidecar, const std::string& passFilter = "all") { // wantPass() returns true if this pass should run under the current filter. // Supported names: "decode", "stencil", "framing", "legacy", + // "sidecar-legacy", "sidecar-stencil", "sidecar-transposed", // "legacy-transposed". "all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; @@ -373,6 +382,12 @@ static void runPerf( // range (bID * BlockWidth ... + BlockWidth - 1) — no synchronisation needed. std::vector sums((size_t)nBlocks * BlockWidth, 0); + // Second sidecar for the `sidecar` pass: written at each voxel's + // VBM-sequential index (firstOffset + bID*BlockWidth + lane), which by + // construction equals the center voxel's ValueOnIndex. Sized to match + // the input sidecar so we can reuse its indexing. + std::vector outputSidecar(sidecar.size(), 0.f); + std::ostringstream sink; // absorbs Timer's warm-pass "... " output nanovdb::util::Timer timer; @@ -546,6 +561,271 @@ static void runPerf( [](uint64_t a, uint64_t b) { return a ^ b; }); } // end wantPass("legacy") + // ---- sidecar-legacy: float value + bool isActive matrices via LegacyStencilAccessor ---- + // Precursor to the full WENO5 pipeline (§11 of BatchAccessor.md). Within + // each SIMDw-lane batch, assembles two per-tap arrays: + // float values[SIZE][SIMDw] -- sidecar[idx] (idx==0 -> background) + // bool isActive[SIZE][SIMDw] -- (idx != 0) + // Token op (anti-DCE, stand-in for WENO arithmetic): per active voxel, + // sum values[k][i] over taps with isActive[k][i]==true, write the result + // to outputSidecar at the voxel's VBM-sequential index. + double sidecarLegacyUs = 0.0; + uint64_t sidecarLegacyChecksum = 0; + if (wantPass("sidecar-legacy")) { + std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); + + sidecarLegacyUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + LegacyAccT legacyAcc(*grid); + + constexpr int SIZE = LegacyAccT::size(); + alignas(64) float values [SIZE][SIMDw]; + alignas(64) bool isActive[SIZE][SIMDw]; + + const float* const scIn = sidecar.data(); + float* const scOut = outputSidecar.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + const uint64_t blockBase = + firstOffset + (uint64_t)bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + // Fill values[][] and isActive[][] for this batch. 
+ for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) { + for (int k = 0; k < SIZE; ++k) { + values[k][i] = scIn[0]; + isActive[k][i] = false; + } + continue; + } + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; + legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); + for (int k = 0; k < SIZE; ++k) { + const uint64_t idx = legacyAcc[k]; + values[k][i] = scIn[idx]; // scIn[0] == background + isActive[k][i] = (idx != 0); + } + } + + // Token op: sum values for Active taps per voxel. + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + float sum = 0.f; + for (int k = 0; k < SIZE; ++k) + if (isActive[k][i]) sum += values[k][i]; + scOut[blockBase + p] = sum; + } + } + } + }); + }); + + // Anti-DCE checksum: XOR of the float bit patterns across the full + // output sidecar. Zero-initialised slots contribute 0 (XOR identity), + // so inactive voxels don't disturb the result. + sidecarLegacyChecksum = + std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), + [](uint64_t a, float b) { + uint32_t bits; + std::memcpy(&bits, &b, sizeof(bits)); + return a ^ uint64_t(bits); + }); + } // end wantPass("sidecar-legacy") + + // ---- sidecar-stencil: same matrices via StencilAccessor (hybrid SIMD+scalar) ---- + // Uses StencilAccessor's mIndices[SIZE][SIMDw] — the result of its SIMD + // direction-decode + scalar leaf.getValue() tail — directly as the + // uint64 index source for the sidecar lookup. Inactive lanes have + // mIndices[k][i]=0 naturally (StencilAccessor zero-fills), so the fill + // loop has no per-lane UnusedLeafIndex guard. + double sidecarStencilUs = 0.0; + uint64_t sidecarStencilChecksum = 0; + if (wantPass("sidecar-stencil")) { + std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); + + sidecarStencilUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + constexpr int SIZE = SAccT::size(); + alignas(64) float values [SIZE][SIMDw]; + alignas(64) bool isActive[SIZE][SIMDw]; + + const float* const scIn = sidecar.data(); + float* const scOut = outputSidecar.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + const uint64_t blockBase = + firstOffset + (uint64_t)bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + + // Tap-outer fill: StencilAccessor stores mIndices[tap][lane] + // contiguously along the lane axis, so iterating k-outer + // turns lane-inner into a 16-wide sweep over one row. 
+ for (int k = 0; k < SIZE; ++k) { + for (int i = 0; i < SIMDw; ++i) { + const uint64_t idx = stencilAcc.mIndices[k][i]; + values[k][i] = scIn[idx]; // scIn[0] == background + isActive[k][i] = (idx != 0); + } + } + + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + float sum = 0.f; + for (int k = 0; k < SIZE; ++k) + if (isActive[k][i]) sum += values[k][i]; + scOut[blockBase + p] = sum; + } + } + } + }); + }); + + sidecarStencilChecksum = + std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), + [](uint64_t a, float b) { + uint32_t bits; + std::memcpy(&bits, &b, sizeof(bits)); + return a ^ uint64_t(bits); + }); + } // end wantPass("sidecar-stencil") + + // ---- sidecar-transposed: tap-outer fill via direct ReadAccessor ---- + // Mirrors `legacy-transposed`'s loop structure, but instead of summing + // uint64 indices into a per-voxel accumulator, the tap-outer loop fills + // values[tap][lane] + isActive[tap][lane]. A second voxel-outer pass + // performs the same token sum as the other variants. + double sidecarXposedUs = 0.0; + uint64_t sidecarXposedChecksum = 0; + if (wantPass("sidecar-transposed")) { + std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); + + using Weno5TapsX = nanovdb::Weno5Stencil::Taps; + static constexpr int SIZEX = int(std::tuple_size_v); + + sidecarXposedUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + alignas(64) nanovdb::Coord centers[SIMDw]; + alignas(64) float values [SIZEX][SIMDw]; + alignas(64) bool isActive[SIZEX][SIMDw]; + nanovdb::ReadAccessor acc(grid->tree().root()); + + const float* const scIn = sidecar.data(); + float* const scOut = outputSidecar.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + const uint64_t blockBase = + firstOffset + (uint64_t)bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + const uint16_t vo = voxelOffset[p]; + const uint32_t li = leafIndex[p]; + const nanovdb::Coord cOrigin = firstLeaf[li].origin(); + centers[i] = cOrigin + nanovdb::Coord( + (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); + } + + auto processTap = [&]() + [[gnu::always_inline]] + { + for (int i = 0; i < SIMDw; ++i) { + if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) { + values [K][i] = scIn[0]; + isActive[K][i] = false; + continue; + } + const nanovdb::Coord c = centers[i] + + nanovdb::Coord(DI, DJ, DK); + const LeafT* leaf = acc.probeLeaf(c); + if (!leaf) { + values [K][i] = scIn[0]; + isActive[K][i] = false; + continue; + } + const uint32_t offset = (uint32_t(c[0] & 7) << 6) + | (uint32_t(c[1] & 7) << 3) + | uint32_t(c[2] & 7); + const uint64_t idx = leaf->data()->getValue(offset); + values [K][i] = scIn[idx]; + isActive[K][i] = (idx != 0); + } + }; + + [&](std::index_sequence) { + (processTap.template operator()< + int(Is), + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk>(), ...); + }(std::make_index_sequence{}); + + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if 
(leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + float sum = 0.f; + for (int k = 0; k < SIZEX; ++k) + if (isActive[k][i]) sum += values[k][i]; + scOut[blockBase + p] = sum; + } + } + } + }); + }); + + sidecarXposedChecksum = + std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), + [](uint64_t a, float b) { + uint32_t bits; + std::memcpy(&bits, &b, sizeof(bits)); + return a ^ uint64_t(bits); + }); + } // end wantPass("sidecar-transposed") + // ---- Legacy transposed: tap-outer, voxel-inner ---- // Same semantics as `legacy`, reordered. For each of the 19 WENO5 taps, // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue @@ -636,6 +916,15 @@ static void runPerf( std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels), (legacyUs - decodeUs) / 1e3, legacyChecksum); + std::printf(" Sidecar (legacy) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + sidecarLegacyUs / 1e3, sidecarLegacyUs * 1e3 / double(nVoxels), + (sidecarLegacyUs - decodeUs) / 1e3, sidecarLegacyChecksum); + std::printf(" Sidecar (stencil) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + sidecarStencilUs / 1e3, sidecarStencilUs * 1e3 / double(nVoxels), + (sidecarStencilUs - decodeUs) / 1e3, sidecarStencilChecksum); + std::printf(" Sidecar (transposed) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + sidecarXposedUs / 1e3, sidecarXposedUs * 1e3 / double(nVoxels), + (sidecarXposedUs - decodeUs) / 1e3, sidecarXposedChecksum); std::printf(" Legacy transposed : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", legacyXposedUs / 1e3, legacyXposedUs * 1e3 / double(nVoxels), (legacyXposedUs - decodeUs) / 1e3, legacyXposedChecksum); @@ -644,6 +933,10 @@ static void runPerf( std::cerr << " WARNING: stencil/legacy checksums differ — accessor results disagree!\n"; if (legacyChecksum != legacyXposedChecksum) std::cerr << " WARNING: legacy/legacy-transposed checksums differ — ordering bug!\n"; + if (sidecarLegacyChecksum != sidecarStencilChecksum) + std::cerr << " WARNING: sidecar legacy/stencil checksums differ — accessor results disagree!\n"; + if (sidecarLegacyChecksum != sidecarXposedChecksum) + std::cerr << " WARNING: sidecar legacy/transposed checksums differ — ordering bug!\n"; } // ============================================================ @@ -663,7 +956,8 @@ static void printUsage(const char* argv0) << " --grid= Select grid by name (default: first FloatGrid)\n" << " --pass= Run one perf pass:\n" << " all (default), verify, decode, stencil,\n" - << " framing, legacy, legacy-transposed\n" + << " framing, legacy, legacy-transposed,\n" + << " sidecar-legacy, sidecar-stencil, sidecar-transposed\n" << " --threads= Limit TBB parallelism (0 = TBB default)\n" << " --skip-validation Skip the sidecar ordering sanity check\n"; } @@ -748,10 +1042,7 @@ int main(int argc, char** argv) if (passFilter == "all" || passFilter == "verify") runPrototype(grid, vbmHandle); - runPerf(grid, vbmHandle, passFilter); - - // Silence unused-variable warning for sidecar until a future pass uses it. 
-    (void)payload.sidecar;
+    runPerf(grid, vbmHandle, payload.sidecar, passFilter);
 } catch (const std::exception& e) {
 std::cerr << "Exception: " << e.what() << "\n";
 return 1;

From a6b08712e4c6f80ddedb677b7c32043ad1d35b9a Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Mon, 20 Apr 2026 17:54:46 -0500
Subject: [PATCH 45/60] WenoStencil: single-source scalar/SIMD stencil +
 out-of-band extrapolation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New header nanovdb/nanovdb/util/WenoStencil.h defines WenoStencil, a
19-tap WENO5 stencil value container parameterised on SIMD lane width.
The same storage declaration holds for both scalar (W=1, GPU) and SIMD
(W>1, CPU batched) instantiations via std::conditional_t:

 W == 1 : ValueT = float PredT = bool
 W > 1 : ValueT = float[W] PredT = bool[W]

Storage is plain C arrays (mValues[SIZE], mIsActive[SIZE]) so callers
fill lane-by-lane with the same syntax in both cases; an internal
addr() helper bridges the W=1 scalar-reference / W>1 array-decay
asymmetry for SIMD load/store inside extrapolate().

extrapolate(absBackground) repairs out-of-band lanes in-place: for
each tap k with mIsActive[k][i] == false, writes
copysign(absBackground, mValues[innerTap][i]) via a hardcoded
ascending-|Δ| cascade (18 pairs):

 |Δ|=1 taps <-- center (0,0,0)
 |Δ|=2 taps <-- |Δ|=1 on same axis
 |Δ|=3 taps <-- |Δ|=2 on same axis

Cascade order guarantees the inner tap is already resolved when the
outer tap is processed, so sign propagates through |Δ|=1 -> |Δ|=2 ->
|Δ|=3 without special casing.

Implementation uses only Simd.h primitives (Simd, SimdMask, where,
operator>, unary minus, store) which collapse cleanly to scalar code
under W=1 fixed_size<1>. No Simd.h changes needed.

Also adds a new benchmark pass `sidecar-stencil-extrap` in
ex_narrowband_stencil_cpu: same fill as sidecar-stencil, then calls
stencil.extrapolate(|sidecar[0]|), with a token sum over ALL 19 taps
(previously the sum was gated by isActive).

Results on taperLER.vdb (24 threads, i9-285K):

 Sidecar (stencil) : 97.3 ms (3.1 ns/voxel) checksum=0xcfbff7c8
 Sidecar (stencil+extrap): 101.8 ms (3.2 ns/voxel) checksum=0x371273d0

Extrap cost: +4.5 ms / 31.8M voxels = ~0.14 ns/voxel (~50 cycles for
18 SIMD blend pairs). The checksum for the extrap pass is stable
run-to-run; it differs from sidecar-stencil as expected since the sum
now includes extrapolated values instead of excluding out-of-band
lanes.

Co-Authored-By: Claude Sonnet 4.6
---
 .../narrowband_stencil_cpu.cpp | 90 ++++++++-
 nanovdb/nanovdb/util/WenoStencil.h | 172 ++++++++++++++++++
 2 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 nanovdb/nanovdb/util/WenoStencil.h

diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp
index c060162b62..927163e5be 100644
--- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include <nanovdb/util/WenoStencil.h>
 #include
 #include
@@ -57,6 +58,7 @@
 #include
 #include
 #include
+#include <cmath> // std::abs (sidecar-stencil-extrap)
 #include
 #include <cstring> // std::memcpy (sidecar-pass checksum)
 #include
@@ -365,8 +367,9 @@ static void runPerf(
 {
 // wantPass() returns true if this pass should run under the current filter.
// Supported names: "decode", "stencil", "framing", "legacy", - // "sidecar-legacy", "sidecar-stencil", "sidecar-transposed", - // "legacy-transposed". "all" runs everything. + // "sidecar-legacy", "sidecar-stencil", "sidecar-stencil-extrap", + // "sidecar-transposed", "legacy-transposed". + // "all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; }; @@ -723,6 +726,83 @@ static void runPerf( }); } // end wantPass("sidecar-stencil") + // ---- sidecar-stencil-extrap: sidecar-stencil + WenoStencil::extrapolate ---- + // Same fill as sidecar-stencil, then calls WenoStencil::extrapolate + // to repair out-of-band lanes via copysign(|background|, mValues[innerTap]). + // After extrapolation, isActive is not needed for the downstream op; + // the token sum over ALL taps (active + extrapolated) is the anti-DCE + // artifact. Checksum will differ from sidecar-stencil (which summed + // active-only) — that's the expected correctness signal. + double sidecarStencilExtrapUs = 0.0; + uint64_t sidecarStencilExtrapChecksum = 0; + if (wantPass("sidecar-stencil-extrap")) { + std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); + + const float absBackground = std::abs(sidecar[0]); // sidecar[0] = floatGrid.background() + + sidecarStencilExtrapUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + nanovdb::WenoStencil stencil; + constexpr int SIZE = nanovdb::WenoStencil::size(); + + const float* const scIn = sidecar.data(); + float* const scOut = outputSidecar.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + const uint64_t blockBase = + firstOffset + (uint64_t)bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + + for (int k = 0; k < SIZE; ++k) { + for (int i = 0; i < SIMDw; ++i) { + const uint64_t idx = stencilAcc.mIndices[k][i]; + stencil.mValues [k][i] = scIn[idx]; + stencil.mIsActive[k][i] = (idx != 0); + } + } + + stencil.extrapolate(absBackground); + + for (int i = 0; i < SIMDw; ++i) { + const int p = batchStart + i; + if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; + float sum = 0.f; + for (int k = 0; k < SIZE; ++k) + sum += stencil.mValues[k][i]; + scOut[blockBase + p] = sum; + } + } + } + }); + }); + + sidecarStencilExtrapChecksum = + std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), + [](uint64_t a, float b) { + uint32_t bits; + std::memcpy(&bits, &b, sizeof(bits)); + return a ^ uint64_t(bits); + }); + } // end wantPass("sidecar-stencil-extrap") + // ---- sidecar-transposed: tap-outer fill via direct ReadAccessor ---- // Mirrors `legacy-transposed`'s loop structure, but instead of summing // uint64 indices into a per-voxel accumulator, the tap-outer loop fills @@ -922,6 +1002,9 @@ static void runPerf( std::printf(" Sidecar (stencil) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", 
sidecarStencilUs / 1e3, sidecarStencilUs * 1e3 / double(nVoxels), (sidecarStencilUs - decodeUs) / 1e3, sidecarStencilChecksum); + std::printf(" Sidecar (stencil+extrap): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", + sidecarStencilExtrapUs / 1e3, sidecarStencilExtrapUs * 1e3 / double(nVoxels), + (sidecarStencilExtrapUs - decodeUs) / 1e3, sidecarStencilExtrapChecksum); std::printf(" Sidecar (transposed) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n", sidecarXposedUs / 1e3, sidecarXposedUs * 1e3 / double(nVoxels), (sidecarXposedUs - decodeUs) / 1e3, sidecarXposedChecksum); @@ -957,7 +1040,8 @@ static void printUsage(const char* argv0) << " --pass= Run one perf pass:\n" << " all (default), verify, decode, stencil,\n" << " framing, legacy, legacy-transposed,\n" - << " sidecar-legacy, sidecar-stencil, sidecar-transposed\n" + << " sidecar-legacy, sidecar-stencil,\n" + << " sidecar-stencil-extrap, sidecar-transposed\n" << " --threads= Limit TBB parallelism (0 = TBB default)\n" << " --skip-validation Skip the sidecar ordering sanity check\n"; } diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h new file mode 100644 index 0000000000..d5069d7411 --- /dev/null +++ b/nanovdb/nanovdb/util/WenoStencil.h @@ -0,0 +1,172 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file WenoStencil.h + + \brief 19-tap WENO5 stencil data container + per-tap out-of-band + extrapolation, templated on SIMD lane width. + + `WenoStencil` holds the per-tap float values and per-tap activity + flags for a single voxel (W=1, scalar / GPU-friendly) or a batch of W + voxels (W>1, CPU SIMD). The underlying element types switch via + `std::conditional_t`: + + W == 1 : ValueT = float PredT = bool + W > 1 : ValueT = float[W] PredT = bool[W] + + Storage is a plain C array (`ValueT mValues[SIZE]`) so the caller can + fill it lane-by-lane with the same scalar syntax in both cases + (`s.mValues[k][i] = ...` at W>1; `s.mValues[k] = ...` at W=1). + + The class's one substantive operation is `extrapolate(|background|)`, + which repairs out-of-band lanes (mIsActive[k] == false) by applying + copysign(|background|, mValues[innerTap]) via an ascending-|Δ| + cascade. After `extrapolate` returns, every tap holds either its + true sidecar value (for active lanes) or a sign-corrected background + magnitude (for inactive lanes) — ready for WENO5 arithmetic. + + The inner-tap mapping is spelled out explicitly (Weno5-specific, + non-generic on purpose): + + |Δ|=1 taps --> inner = center tap (0,0,0) + |Δ|=2 taps --> inner = |Δ|=1 tap on the same axis + |Δ|=3 taps --> inner = |Δ|=2 tap on the same axis + + Cascade order (ascending-|Δ|) guarantees the inner tap is already + resolved when the outer tap is processed, so distance-3 taps inherit + sign via the |Δ|=1 → |Δ|=2 → |Δ|=3 chain without special casing. + + See BatchAccessor.md §11 for the full Phase-2 sidecar-WENO pipeline + design and §11.2 for the extrapolation semantics. 
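+
+ Minimal usage sketch (illustrative, CPU batch at W=16; the fill loop
+ and the `indices` / `sidecar` sources are assumptions standing in for
+ caller code such as the sidecar-stencil-extrap pass):
+
+ \code
+ WenoStencil<16> s;
+ for (int k = 0; k < s.size(); ++k)
+     for (int i = 0; i < 16; ++i) {
+         const uint64_t idx = indices[k][i];  // 0 => out-of-band
+         s.mValues [k][i] = sidecar[idx];     // sidecar[0] == background
+         s.mIsActive[k][i] = (idx != 0);
+     }
+ s.extrapolate(absBackground); // repair OOB lanes in-place
+ \endcode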
+*/
+
+#pragma once
+
+#include
+#include <nanovdb/util/StencilAccessor.h> // StencilPoint, Weno5Stencil, detail::findIndex
+
+#include
+#include
+#include
+#include
+
+namespace nanovdb {
+
+template <int W>
+class WenoStencil
+{
+public:
+ using Taps = Weno5Stencil::Taps;
+ using Hull = Weno5Stencil::Hull;
+ static constexpr int SIZE = int(std::tuple_size_v<Taps>);
+
+ // Per-lane storage shape chosen by W:
+ // W == 1 : plain scalar (GPU thread-per-voxel model)
+ // W > 1 : W-wide array (CPU SIMD batch)
+ using ValueT = std::conditional_t<W == 1, float, float[W]>;
+ using PredT = std::conditional_t<W == 1, bool, bool[W]>;
+
+ alignas(64) ValueT mValues [SIZE];
+ alignas(64) PredT mIsActive[SIZE];
+
+ static constexpr int size() { return SIZE; }
+
+ // Compile-time named-tap access: returns the index of tap (DI,DJ,DK)
+ // in the Taps tuple, matching StencilAccessor's convention.
+ template <int DI, int DJ, int DK>
+ static constexpr int tapIndex()
+ {
+ constexpr int I = detail::findIndex<Taps, DI, DJ, DK>(
+ std::make_index_sequence<SIZE>{});
+ static_assert(I >= 0, "WenoStencil::tapIndex: tap not in stencil");
+ return I;
+ }
+
+ // Replace out-of-band lanes (mIsActive[k][i] == false) of mValues[k]
+ // with copysign(absBackground, mValues[innerTap][i]). Active lanes
+ // are untouched. Center tap (0,0,0) is assumed always in-band and
+ // is not processed.
+ //
+ // Requires absBackground >= 0 (caller typically passes
+ // std::abs(floatGrid.background()) or sidecar[0] for a narrow-band
+ // level set where background > 0).
+ void extrapolate(float absBackground);
+
+private:
+ // Bridge W=1 (scalar reference) and W>1 (array decays to pointer).
+ // The address taken at W=1 is to the scalar member of mValues/mIsActive;
+ // at W>1 an array-to-pointer decay works without extra syntax.
+ static constexpr float* addr( ValueT& v) noexcept {
+ if constexpr (W == 1) return &v; else return v;
+ }
+ static constexpr const float* addr(const ValueT& v) noexcept {
+ if constexpr (W == 1) return &v; else return v;
+ }
+ static constexpr bool* addr( PredT& p) noexcept {
+ if constexpr (W == 1) return &p; else return p;
+ }
+ static constexpr const bool* addr(const PredT& p) noexcept {
+ if constexpr (W == 1) return &p; else return p;
+ }
+
+ // Hardcoded (tap, innerTap) pairs for Weno5Stencil::Taps, ordered by
+ // ascending |Δ|. Indices match the tuple definition in StencilAccessor.h.
+ //
+ // idx 0 : center ( 0, 0, 0)
+ // idx 1.. 6 : x-axis (-3..+3)
+ // idx 7..12 : y-axis (-3..+3)
+ // idx 13..18 : z-axis (-3..+3)
+ static constexpr int kNumPairs = 18;
+ static constexpr int kPairs[kNumPairs][2] = {
+ // |Δ|=1 (inner tap = center, idx 0)
+ { 3, 0}, { 4, 0}, // x: -1, +1
+ { 9, 0}, {10, 0}, // y: -1, +1
+ {15, 0}, {16, 0}, // z: -1, +1
+ // |Δ|=2 (inner tap = |Δ|=1 on same axis)
+ { 2, 3}, { 5, 4}, // x: -2<-(-1), +2<-(+1)
+ { 8, 9}, {11, 10}, // y
+ {14, 15}, {17, 16}, // z
+ // |Δ|=3 (inner tap = |Δ|=2 on same axis)
+ { 1, 2}, { 6, 5}, // x: -3<-(-2), +3<-(+2)
+ { 7, 8}, {12, 11}, // y
+ {13, 14}, {18, 17} // z
+ };
+};
+
+// ---------------------------------------------------------------------------
+// extrapolate — single-source implementation.
+//
+// Same body compiles for scalar (W=1) and SIMD (W>1): Simd.h's fixed_size<1>
+// path collapses every instruction to a scalar store. The only non-uniform
+// bit is the addr() helper above.
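+//
+// Illustratively (not part of the header), at W=1 one loop iteration
+// reduces to scalar selects:
+//
+//     const float extrap = (mValues[kInner] < 0.f) ? -absBackground
+//                                                  :  absBackground;
+//     if (!mIsActive[k]) mValues[k] = extrap;
+//
+// i.e. a compare plus conditional moves; no vector registers or masks.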
+// ---------------------------------------------------------------------------
+template <int W>
+void WenoStencil<W>::extrapolate(float absBackground)
+{
+ using FloatV = nanovdb::util::Simd<float, W>;
+ using MaskV = nanovdb::util::SimdMask<float, W>;
+
+ const FloatV absBg(absBackground); // broadcast
+ const FloatV zero (0.0f);
+
+ for (int p = 0; p < kNumPairs; ++p) {
+ const int k = kPairs[p][0];
+ const int kInner = kPairs[p][1];
+
+ const MaskV active(addr(mIsActive[k]), nanovdb::util::element_aligned);
+ const FloatV val (addr(mValues [k]), nanovdb::util::element_aligned);
+ const FloatV inner (addr(mValues [kInner]), nanovdb::util::element_aligned);
+
+ // copysign(absBg, inner): +absBg if inner >= 0, else -absBg.
+ const MaskV isNegInner = zero > inner;
+ const FloatV extrap = nanovdb::util::where(isNegInner, -absBg, absBg);
+
+ // Active lanes keep `val`; inactive lanes take `extrap`.
+ const FloatV result = nanovdb::util::where(active, val, extrap);
+
+ nanovdb::util::store(result, addr(mValues[k]), nanovdb::util::element_aligned);
+ }
+}
+
+} // namespace nanovdb

From 0f8fccdb5c46df708976aa56e30f231e92b0c79b Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Mon, 20 Apr 2026 18:05:51 -0500
Subject: [PATCH 46/60] =?UTF-8?q?WenoStencil.md=20+=20BatchAccessor.md=20?=
 =?UTF-8?q?=C2=A711.6:=20Stage=201-3=20documentation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New WenoStencil.md captures the Stage-3 design rationale for
nanovdb/nanovdb/util/WenoStencil.h:

- §2 single-source scalar/SIMD via std::conditional_t storage
  (ValueT = float vs float[W], PredT = bool vs bool[W])
- §3 the addr() bridge for W=1 scalar-ref vs W>1 array-decay at SIMD
  load/store sites; rejected alternative (Simd.h reference overloads)
  and why
- §4 extrapolation semantics (OOB problem, ascending-|Δ| cascade,
  hardcoded kPairs table for Weno5)
- §5 extrapolate() implementation walkthrough + per-pair cost model
- §6 API usage (fill / extrapolate / named-tap access)
- §7 future work: reconstruct() method, Weno5Stencil policy
  consolidation, generic StencilPolicy parameterisation
- §8 relation to BatchAccessor.md / StencilAccessor.md /
  HaloStencilAccessor.md

BatchAccessor.md §11.6 (Implementation status) added to the "Target
pipeline" section summarising Stage 1 (batch-restructure), Stage 2
(sidecar variants), and Stage 3 (WenoStencil + extrapolate) with
commit SHAs, perf tables, and a trimmed "Remaining" list pointing at
the Phase-3 WENO5 arithmetic kernel as the next step.

Co-Authored-By: Claude Sonnet 4.6
---
 nanovdb/nanovdb/util/BatchAccessor.md | 93 +++++++
 nanovdb/nanovdb/util/WenoStencil.md | 354 ++++++++++++++++++++++++++
 2 files changed, 447 insertions(+)
 create mode 100644 nanovdb/nanovdb/util/WenoStencil.md

diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md
index d90cc6dc5f..88f15a04af 100644
--- a/nanovdb/nanovdb/util/BatchAccessor.md
+++ b/nanovdb/nanovdb/util/BatchAccessor.md
@@ -1800,3 +1800,96 @@ Implementation items that follow directly from §11.1–§11.4:
 - **Batch-width ordering benchmark** — rerun the legacy/transposed
   comparison at W=4 and W=8 over floats (not uint64 indices) to lock
   in the Phase-2 loop order.
+
+### 11.6 Implementation status (Stage 1 / 2 / 3 landed)
+
+The target pipeline above is being landed in incremental stages. The
+first three are done; the WENO5 arithmetic + write-back phase is the
+remaining gap before a full end-to-end advection step.
+ +#### Stage 1 — batch-structured outer loop + +`ex_stencil_gather_cpu` and `ex_narrowband_stencil_cpu`: the `legacy`, +`framing`, and `legacy-transposed` passes were restructured from a +flat `for i in 0..BlockWidth` loop into a nested +`for batchStart in 0..BlockWidth step SIMDw; for i in 0..SIMDw` loop. +The `stencil` pass was already batch-structured and served as the +reference shape. Rationale: every downstream phase (sidecar fetch, +extrapolate, WENO5 reconstruct) is SIMD-wide, so the outer loop has +to match that cadence first. + +Perf cost (narrowband, 24 threads): ~5% on legacy, ~7% on +legacy-transposed — the flat-128 loop previously gave GCC more +iterations over which to amortize loop overhead. Recovered in full +by subsequent stages. + +Commit: `5a920596`. + +#### Stage 2 — sidecar value assembly (uint64 → float via lookup) + +`ex_narrowband_stencil_cpu`: three new passes that assemble the +per-batch `float values[SIZE][SIMDw]` + `bool isActive[SIZE][SIMDw]` +matrices from the sidecar, plus a stand-in token op (sum of active +tap values per voxel, written to a second sidecar at the VBM-sequential +index). Variants: + +| Pass | Gather method | Time (ns/voxel) | +|------|---------------|----------------:| +| `sidecar-legacy` | LegacyStencilAccessor scalar moveTo | 4.1 | +| `sidecar-stencil` | StencilAccessor (hybrid SIMD + scalar tail) | **3.0** | +| `sidecar-transposed` | ReadAccessor tap-outer direct probeLeaf | 4.0 | + +All three produce identical output checksums (cross-validation). +StencilAccessor wins by ~25% over the scalar paths — its SIMD moveTo +amortises direction-decode + leaf-cache across the batch, and its +contiguous `mIndices[k][i]` row feeds a vector-friendly sidecar gather. + +Supporting change: `convertToIndexGridWithSidecar` now sets +`sidecar[0] = floatGrid.background()` so the sidecar fetch is +unconditional (no per-lane branch on `idx == 0`). + +Commit: `110d852c`. + +#### Stage 3 — out-of-band extrapolation via WenoStencil + +New header `nanovdb/nanovdb/util/WenoStencil.h` (design doc: +[WenoStencil.md](WenoStencil.md)) defines `WenoStencil`: a 19-tap +value + activity container templated on SIMD lane width, with a +single-source scalar/SIMD `extrapolate(absBackground)` method. The +extrapolation implements the cascade from §11.2: out-of-band lanes +take `copysign(|background|, mValues[innerTap][lane])`, processed in +ascending-|Δ| order so the inner tap is always already resolved. + +Integration: new `sidecar-stencil-extrap` pass in +`ex_narrowband_stencil_cpu` reuses StencilAccessor for the gather, +fills a `WenoStencil`, calls `extrapolate()`, then sums all +19 taps unconditionally (no longer gated by `isActive`). + +Measured extrapolation overhead: **+4.5 ms / 31.8M voxels += 0.14 ns/voxel** end-to-end on taperLER.vdb (24 threads, +i9-285K) — 18 SIMD blend pairs per batch, ~126 cycles per 16-voxel +batch, ~8 cycles/voxel per core. + +| Pass | ns/voxel | Checksum | +|------|---------:|----------| +| `sidecar-stencil` | 3.1 | `0xcfbff7c8` | +| `sidecar-stencil-extrap` | 3.2 | `0x371273d0` | + +Checksums differ as expected: the extrap variant sums +`mValues[k]` for all 19 taps after extrapolation, whereas +`sidecar-stencil` gates the sum by `isActive[k]` and so excludes +out-of-band lanes. + +Commit: `a6b08712`. 
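+
+A minimal per-batch sketch of the Stage-3 shape (illustrative; `acc`
+and `sidecar` stand in for the pass-local variables in
+`ex_narrowband_stencil_cpu`):
+
+```cpp
+nanovdb::WenoStencil<SIMDw> stencil;
+for (int k = 0; k < stencil.size(); ++k)          // 19 taps
+    for (int i = 0; i < SIMDw; ++i) {             // W lanes
+        const uint64_t idx = acc.mIndices[k][i];  // 0 => out-of-band
+        stencil.mValues [k][i] = sidecar[idx];    // sidecar[0] == background
+        stencil.mIsActive[k][i] = (idx != 0);
+    }
+stencil.extrapolate(std::abs(sidecar[0]));        // repair OOB lanes in-place
+// downstream arithmetic may now read all 19 taps unconditionally
+```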
+
+#### Remaining (Stage 4+)
+
+The §11.5 "deliverables" list is now reduced to two items:
+
+- **Phase-3 WENO5 kernel** — `reconstruct()` method on
+  `WenoStencil` (or a free function consuming it) producing per-axis
+  fluxes via the Weno5 arithmetic, single-source across W. Sketched
+  in WenoStencil.md §7.1.
+- **Batch-width ordering benchmark** — at W=4 / W=8 over floats,
+  to validate that the tap-outer fill hypothesis (§11.3) holds at
+  the real Phase-3 inner-loop size before locking the shape.
diff --git a/nanovdb/nanovdb/util/WenoStencil.md b/nanovdb/nanovdb/util/WenoStencil.md
new file mode 100644
index 0000000000..2a3899b45e
--- /dev/null
+++ b/nanovdb/nanovdb/util/WenoStencil.md
@@ -0,0 +1,354 @@
+# WenoStencil — Single-Source Scalar/SIMD WENO5 Stencil Value Container
+
+Design reference for `nanovdb/nanovdb/util/WenoStencil.h`. Captures the
+rationale behind the templated-on-lane-width class, the out-of-band
+extrapolation algorithm, and the relationship to the broader Phase-2
+pipeline sketched in `BatchAccessor.md §11`.
+
+---
+
+## 1. Motivation
+
+The WENO5 CPU pipeline (`BatchAccessor.md §11`) produces a per-batch
+matrix of 19 tap values per voxel:
+
+```
+float values  [Ntaps][W] -- real sidecar value, or background for OOB lanes
+bool  isActive[Ntaps][W] -- true iff the tap voxel is in the narrow band
+```
+
+The next pipeline phase consumes `values[tap][lane]` as 19
+`Simd<float, W>` rows for the WENO5 reconstruction arithmetic. But
+before arithmetic, out-of-band lanes must be repaired so that
+`values[tap][lane]` holds a sensible float for every lane, not just
+the in-band ones. That repair is the **extrapolation** step.
+
+`WenoStencil` encapsulates:
+
+1. Storage for the 19-tap × W-lane data + activity flags.
+2. The extrapolation algorithm (ascending-|Δ| cascade, hardcoded tap pairs).
+3. A pattern that keeps GPU (W=1) code textually identical to CPU SIMD
+   (W>1) code — only the storage shapes and the Simd.h backend differ.
+
+---
+
+## 2. Single-source scalar/SIMD design
+
+### 2.1 The conditional_t storage trick
+
+```cpp
+template <int W>
+class WenoStencil
+{
+public:
+    using ValueT = std::conditional_t<W == 1, float, float[W]>;
+    using PredT  = std::conditional_t<W == 1, bool,  bool[W]>;
+
+    alignas(64) ValueT mValues  [SIZE];
+    alignas(64) PredT  mIsActive[SIZE];
+    /* ... */
+};
+```
+
+| W | `mValues` expands to | `mIsActive` expands to | Intended use |
+|---|---|---|---|
+| 1 | `float mValues[19]` | `bool mIsActive[19]` | GPU thread-per-voxel — 19 scalar registers |
+| 16 | `float mValues[19][16]` | `bool mIsActive[19][16]` | CPU SIMD batch — 19 YMM-tiles in L1 |
+
+Same declaration syntax, different expansion. Memory layout at W>1 is
+identical to writing `float mValues[SIZE][W]` directly — no performance
+difference, just a cleaner scalar case at W=1.
+
+### 2.2 Why the same source compiles to good scalar and SIMD code
+
+`extrapolate()`'s body uses only `nanovdb::util::Simd` primitives:
+
+- load / store (ctor + free `store`)
+- `operator>` (produces `SimdMask`)
+- unary `operator-`
+- `where(mask, a, b)` — 3-arg blend
+
+All of these exist in Simd.h's scalar degenerate (`fixed_size<1>` in the
+stdx backend, 1-element array in the array backend). At W=1 the
+compiler inlines and collapses every operation to a plain scalar
+instruction. **One source body, two target ISAs, no `if constexpr`
+branches on W inside the algorithm.**
+
+---
+
+## 3. The `addr()` bridge
+
+One asymmetry survives the `conditional_t` unification: at W=1,
+`mValues[k]` is a scalar `float` value (not an array), so it does not
+decay to `float*` — `Simd::load(mValues[k], flags)` wouldn't
+type-check. At W>1, `mValues[k]` is a `float[W]` array and decays
+naturally.
+
+A private `addr()` helper papers over this in one place:
+
+```cpp
+static constexpr float* addr(ValueT& v) noexcept {
+    if constexpr (W == 1) return &v; else return v;
+}
+```
+
+Callers (inside `extrapolate()`) always write:
+
+```cpp
+FloatV val(addr(mValues[k]), element_aligned);
+```
+
+and get a uniform expression that works at any W. There are four
+overloads (`ValueT&` / `const ValueT&` / `PredT&` / `const PredT&`);
+all are `constexpr` and compile to an array-to-pointer decay at W>1
+and a trivial address fetch at W=1.
+
+**Alternative considered (rejected):** overloading `Simd::load`
+to accept `T&` at W=1. Blocked by the stdx backend's type-alias
+representation (`using Simd = stdx::fixed_size_simd<T, N>`) — we can't
+add member ctors to an alias. The equivalent free-function workaround
+turned out no shorter than the `addr()` helper and would have forced
+Simd.h churn for a benefit scoped to one class. See the Stage-3 design
+exchange for the full discussion.
+
+---
+
+## 4. Extrapolation semantics
+
+### 4.1 The out-of-band problem
+
+For a narrow-band SDF, only the center tap `<0,0,0>` is guaranteed to
+be in the active narrow band. Every other tap may land outside the
+band for some lanes of a batch — for those lanes, `idx == 0` in the
+sidecar fill, so `values[k][lane] = sidecar[0] = |background|` and
+`isActive[k][lane] = false`.
+
+Applying WENO5 arithmetic directly to the `|background|` magnitude
+produces wrong gradients at the band boundary: the reconstructed field
+would not track the sign of the underlying signed distance function.
+The standard fix is to **extrapolate from the next inner tap's sign**:
+
+```
+if (!isActive[k][lane])
+    values[k][lane] = copysign(|background|, values[innerTap][lane])
+```
+
+The `|background|` magnitude is preserved; the sign is copied from
+whichever "inner" tap (one step closer to center along the same axis)
+best represents which side of the surface this lane belongs to.
+
+### 4.2 Inner-tap cascade — ascending |Δ| order
+
+| Outer taps (by \|Δ\|) | Inner tap (source of sign) |
+|---|---|
+| `<±1,0,0>`, `<0,±1,0>`, `<0,0,±1>` | center `<0,0,0>` |
+| `<±2,0,0>`, `<0,±2,0>`, `<0,0,±2>` | `<±1,0,0>`, `<0,±1,0>`, `<0,0,±1>` |
+| `<±3,0,0>`, `<0,±3,0>`, `<0,0,±3>` | `<±2,0,0>`, `<0,±2,0>`, `<0,0,±2>` |
+
+Processing taps in ascending-|Δ| order guarantees the inner tap is
+already resolved (real value or previously extrapolated) when the outer
+tap is processed. Sign propagation through a |Δ|=1 → |Δ|=2 → |Δ|=3
+chain is automatic — no special casing.
+
+### 4.3 The `kPairs[]` table
+
+The inner-tap relationship is `Weno5Stencil`-specific and hardcoded as
+a static table inside the class:
+
+```cpp
+static constexpr int kNumPairs = 18;
+static constexpr int kPairs[kNumPairs][2] = {
+    // |Δ|=1 (inner = center, idx 0)
+    {3,0},{4,0},{9,0},{10,0},{15,0},{16,0},
+    // |Δ|=2 (inner = |Δ|=1 on same axis)
+    {2,3},{5,4},{8,9},{11,10},{14,15},{17,16},
+    // |Δ|=3 (inner = |Δ|=2 on same axis)
+    {1,2},{6,5},{7,8},{12,11},{13,14},{18,17}
+};
+```
+
+Indices match the tuple ordering in `Weno5Stencil::Taps`
+(`StencilAccessor.h`). Center tap (idx 0) is not processed — assumed
+always in-band.
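+
+A cheap compile-time cross-check of the ordering the table relies on
+(a sketch; `tapIndex` is the public accessor from §6.3, and the
+expected indices follow the idx comments above):
+
+```cpp
+static_assert(WenoStencil<1>::tapIndex<-1,  0,  0>() ==  3); // x: |Δ|=1
+static_assert(WenoStencil<1>::tapIndex< 0, -1,  0>() ==  9); // y: |Δ|=1
+static_assert(WenoStencil<1>::tapIndex< 0,  0, +2>() == 17); // z: |Δ|=2
+```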
+ +**Why hardcoded, not template-derived:** a generic scheme would walk +`Weno5Stencil::Taps` at compile time and derive inner-tap indices from +|Δ| and axis alignment. For a single stencil (Weno5) this is +over-engineering: the table is 18 entries, reads directly, and makes +the cascade ordering self-documenting. Worth revisiting if we add +Weno7 or other axis-aligned WENO variants. + +--- + +## 5. Extrapolate — implementation + +```cpp +template +void WenoStencil::extrapolate(float absBackground) +{ + using FloatV = nanovdb::util::Simd ; + using MaskV = nanovdb::util::SimdMask; + + const FloatV absBg(absBackground); + const FloatV zero (0.0f); + + for (int p = 0; p < kNumPairs; ++p) { + const int k = kPairs[p][0]; + const int kInner = kPairs[p][1]; + + const MaskV active(addr(mIsActive[k]), element_aligned); + const FloatV val (addr(mValues [k]), element_aligned); + const FloatV inner (addr(mValues [kInner]), element_aligned); + + // copysign(absBg, inner): +absBg if inner >= 0, else -absBg. + const MaskV isNegInner = zero > inner; + const FloatV extrap = where(isNegInner, -absBg, absBg); + + // Active lanes keep `val`; inactive lanes take `extrap`. + const FloatV result = where(active, val, extrap); + store(result, addr(mValues[k]), element_aligned); + } +} +``` + +**Per-pair cost (W=16, AVX2):** + +| Op | Cycles (est.) | +|----|--------------:| +| 3× load (mIsActive, mValues[k], mValues[kInner]) | 3 | +| `0 > inner` (vcmpltps + sign mask) | 1 | +| `where(isNegInner, -absBg, absBg)` (vblendvps) | 1 | +| `where(active, val, extrap)` (mask convert + vblendvps) | 1–2 | +| 1× store (mValues[k]) | 1 | +| **≈ 7 cycles / pair** | + +Total: 18 pairs × 7 cycles = ~126 cycles per call. Amortised over +W=16 lanes gives ~8 cycles/voxel, or ~2 ns/voxel on a 4 GHz core. + +Measured overhead in `sidecar-stencil-extrap` pass on taperLER.vdb +(24 threads): **+4.5 ms / 31.8M voxels = 0.14 ns/voxel** end-to-end — +lines up with the per-core estimate divided by thread count (24× +speedup ≈ 2 / 24 ≈ 0.083 ns; measurement includes framing overhead). + +**Skipping active lanes:** the algorithm reads and computes for every +lane regardless of `isActive`. For active lanes, `extrap` is computed +but then discarded by the final `where`. This wasted work is cheaper +than a predicated-store alternative because: + +- The SIMD blend is one instruction (`vblendvps`). +- Per-lane branching would serialize the batch. +- Active-fraction is high (~90% on narrow-band SDFs), so masked + computation saves little even in the best case. + +--- + +## 6. API usage + +### 6.1 Filling the stencil from sidecar indices + +```cpp +WenoStencil stencil; +StencilAccessor acc(grid, ...); +acc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + +for (int k = 0; k < WenoStencil::size(); ++k) { + for (int i = 0; i < SIMDw; ++i) { + const uint64_t idx = acc.mIndices[k][i]; + stencil.mValues [k][i] = sidecar[idx]; // sidecar[0] = background + stencil.mIsActive[k][i] = (idx != 0); + } +} +``` + +At W=1 (GPU per-thread) the same body would just drop the `[i]` index: + +```cpp +stencil.mValues [k] = sidecar[idx]; +stencil.mIsActive[k] = (idx != 0); +``` + +### 6.2 Extrapolating + +```cpp +stencil.extrapolate(std::abs(floatGrid.background())); +``` + +After this call, every `stencil.mValues[k][i]` holds either the real +sidecar value (for active lanes) or a sign-corrected `|background|` +(for inactive lanes). `mIsActive[]` is no longer needed downstream. 
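+
+A worked single-lane illustration of the cascade (hypothetical values,
+with |background| = 0.3): suppose along +x the lane holds
+`center = -0.05` (in-band), `+1 = -0.12` (in-band), and `+2`, `+3`
+out-of-band. The |Δ|=2 pass writes `copysign(0.3, -0.12) = -0.3` into
+`+2`; the |Δ|=3 pass then reads the just-repaired `+2` and writes
+`copysign(0.3, -0.3) = -0.3` into `+3`; the sign propagates outward
+without any special casing.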
+ +### 6.3 Compile-time named-tap access + +```cpp +constexpr int ctr = WenoStencil::tapIndex<0, 0, 0>(); +float centerValue = stencil.mValues[ctr][i]; + +constexpr int xm3 = WenoStencil::tapIndex<-3, 0, 0>(); +// ... etc, for WENO5 arithmetic +``` + +`tapIndex()` forwards to `detail::findIndex` (shared with +`StencilAccessor`), static-asserting at compile time that the requested +tap exists in the Weno5Stencil::Taps tuple. + +--- + +## 7. Future work + +### 7.1 WENO5 reconstruction method + +The class's second substantive operation (not yet implemented) will be +the WENO5 arithmetic itself — a compile-time fold over the 19 tap +rows, producing three `Simd` fluxes (one per axis) from the +fully-resolved `mValues` matrix. Natural signature: + +```cpp +struct Weno5Flux { FloatV dx, dy, dz; }; +Weno5Flux reconstruct() const; +``` + +Adopting the same single-source structure: at W=1 the fluxes collapse +to scalars; at W>1 they are SIMD vectors. + +### 7.2 Consolidate the Weno5Stencil policy + +Currently `Weno5Stencil` (the tap-tuple policy struct) lives in +`StencilAccessor.h` and is shared with `WenoStencil` via +`using Taps = Weno5Stencil::Taps`. The policy is arguably a +Weno-specific definition and could move into `WenoStencil.h`; +`StencilAccessor.h` would then `#include <.../WenoStencil.h>` for the +policy. Left as-is for this pass to minimise Stage-3 churn. + +### 7.3 Alternative stencils + +If/when Weno7 or a non-axis-aligned stencil is needed, the class would +specialise on a stencil-policy template parameter rather than hardcode +`Weno5Stencil`: + +```cpp +template +class AxisAlignedStencil { /* derive kPairs at compile time */ }; +``` + +The `kPairs` table would be generated from `StencilPolicy::Taps` via a +constexpr pass that finds, for each tap, the same-axis neighbour with +|Δ| = |tap.Δ| − 1. Not needed until a second axis-aligned stencil +exists. + +--- + +## 8. Relationship to other design docs + +- **`BatchAccessor.md §11`** — the broader Phase-2 pipeline plan + (VBM decode → sidecar assembly → extrapolation → WENO arithmetic → + write-back). WenoStencil implements the "extrapolation" step and + provides the storage that carries data from "sidecar assembly" into + the future "WENO arithmetic" step. +- **`StencilAccessor.md`** — Phase-1 accessor (batched uint64 index + gather). StencilAccessor fills `mIndices[SIZE][W]`; WenoStencil + consumes those indices (via `sidecar[idx]` in user code) and owns + the per-lane float result. +- **`HaloStencilAccessor.md`** — speculative alternative that + precomputes a dense float halo buffer; if that path is pursued, + WenoStencil would fill from the halo instead of from sidecar + indices. The extrapolation algorithm here transfers unchanged. From 3c33d85d5b30c0ea4c38f8fce6e7e24b36522add Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 20 Apr 2026 23:03:33 -0500 Subject: [PATCH 47/60] WenoStencil: Simd-typed storage + normSqGrad; caller-owned fill buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor WenoStencil to hold first-class Simd values directly, rather than raw float[W]/bool[W] C arrays with an addr() bridge at every SIMD load/store site. The class becomes pure-compute: no fill-side storage, no addr() ceremony, no element_aligned tags inside the kernel bodies. The caller owns whatever scalar-scatter target shape is natural for its source (typically a pair of stack-local alignas(64) C arrays at CPU W>1; no intermediate buffer at all on CUDA W=1). 
FloatV values [SIZE]; // Simd — first-class storage MaskV isActive[SIZE]; // SimdMask float mDx2, mInvDx2; // scalar grid constants, broadcast on use At W=1 the Simd types collapse to plain float/bool under the array backend — the CUDA per-thread code path reads as pure scalar arithmetic, which was the primary motivation for the refactor: scalar is the production workhorse on GPU, not a degenerate convenience of the CPU SIMD design. The earlier Approach-1 design (std::conditional_t + addr() helper) forced W=1 to load `Simd` via address-of- scalar, which propagated SIMD-ish ceremony into every __hostdev__ method. Approach 2 sidesteps this: at W=1 there is no Simd wrapper at the storage level at all. Also adds `normSqGrad(iso = 0.f)`, the fifth-order upwind Godunov norm-square-gradient method. Line-for-line transliteration of the scalar ground-truth `WenoStencil::normSqGrad(isoValue)` in `nanovdb/math/Stencils.h`, adapted to generic-T: - Six axial WENO5 reconstructions via free-function `nanovdb::detail::WENO5` — mirrors `nanovdb::math::WENO5` identically, wrapping integer literals in RealT() for Simd broadcast. - `nanovdb::detail::GodunovsNormSqrd` computes both outside/inside branches unconditionally and blends via util::where; replaces the scalar ground-truth's runtime if/else so the SIMD path has no control-flow divergence across lanes. - Final scalar mInvDx2 broadcast to FloatV at the single combinator multiplication (free on x86; identity at W=1). sidecar-stencil-extrap pass in ex_narrowband_stencil_cpu updated to the new API: - Caller allocates stack-local raw_values[SIZE][SIMDw] / raw_active[SIZE][SIMDw] inside the TBB task body. - Scalar scatter fill populates the raw buffers from the sidecar via stencilAcc.mIndices[k][i]. - SIMD load-per-tap moves the data into stencil.values[k] / stencil.isActive[k]. - stencil.extrapolate(absBackground) operates in-place on the Simd values. - The token sum over 19 taps now runs as a pure Simd accumulate, then stores to a stack-local sum_lanes[SIMDw] for the per-voxel scalar write to the output sidecar (gated by leafIndex). Checksum matches byte-for-byte (0x371273d0 on taperLER.vdb), timing within noise of the pre-refactor baseline (~82 ms / 2.6 ns/voxel end-to-end on 24 threads). WenoStencil.md rewritten: §2 now describes Simd-typed storage and the caller-owned fill-buffer convention (the addr() bridge section is removed); §4 documents normSqGrad including the generic-T WENO5 / GodunovsNormSqrd helpers and their relationship to the scalar ground-truth; §5.1/§5.2 show the two usage patterns (CPU SIMD with raw buffers + explicit load; CUDA scalar with direct per-tap assignment); §6 diagrams the ownership boundaries; §7.1 flags that normSqGrad perf hasn't been measured yet. 
Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Efty Sifakis
---
 .../narrowband_stencil_cpu.cpp | 34 +-
 nanovdb/nanovdb/util/WenoStencil.h | 283 ++++++----
 nanovdb/nanovdb/util/WenoStencil.md | 491 +++++++++++-------
 3 files changed, 530 insertions(+), 278 deletions(-)

diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp
index 927163e5be..6ccef74178 100644
--- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp
@@ -746,8 +746,16 @@ static void runPerf(
 alignas(64) uint32_t leafIndex[BlockWidth];
 alignas(64) uint16_t voxelOffset[BlockWidth];

+ // Caller-owned fill-side scratch — scalar scatter writes from
+ // the sidecar land here, then a per-tap SIMD load moves the
+ // data into the stencil's Simd-typed compute view.
+ alignas(64) float raw_values[nanovdb::WenoStencil<SIMDw>::size()][SIMDw];
+ alignas(64) bool raw_active[nanovdb::WenoStencil<SIMDw>::size()][SIMDw];
+
 nanovdb::WenoStencil<SIMDw> stencil;
 constexpr int SIZE = nanovdb::WenoStencil<SIMDw>::size();
+ using FloatV = nanovdb::util::Simd<float, SIMDw>;
+ using MaskV = nanovdb::util::SimdMask<float, SIMDw>;

 const float* const scIn = sidecar.data();
 float* const scOut = outputSidecar.data();
@@ -771,23 +779,37 @@ static void runPerf(
 for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
 stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart);

+ // Scalar scatter fill into caller-owned C arrays.
 for (int k = 0; k < SIZE; ++k) {
 for (int i = 0; i < SIMDw; ++i) {
 const uint64_t idx = stencilAcc.mIndices[k][i];
- stencil.mValues [k][i] = scIn[idx];
- stencil.mIsActive[k][i] = (idx != 0);
+ raw_values[k][i] = scIn[idx];
+ raw_active[k][i] = (idx != 0);
 }
 }

+ // SIMD load-per-tap into the stencil's compute view.
+ for (int k = 0; k < SIZE; ++k) {
+ stencil.values [k] = FloatV(raw_values[k], nanovdb::util::element_aligned);
+ stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::element_aligned);
+ }
+
+ // Arithmetic — reads/writes stencil.values[] as Simd in place.
 stencil.extrapolate(absBackground);

+ // Token sum over all 19 taps, entirely in Simd form.
+ FloatV sum(0.f);
+ for (int k = 0; k < SIZE; ++k) sum = sum + stencil.values[k];
+
+ // Simd → scalar bridge at the output side, mirroring the
+ // fill-side bridge: SIMD store into a scratch, then per-lane
+ // scalar write to the output sidecar (gated by leafIndex).
+ alignas(64) float sum_lanes[SIMDw];
+ nanovdb::util::store(sum, sum_lanes, nanovdb::util::element_aligned);
 for (int i = 0; i < SIMDw; ++i) {
 const int p = batchStart + i;
 if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
- float sum = 0.f;
- for (int k = 0; k < SIZE; ++k)
- sum += stencil.mValues[k][i];
- scOut[blockBase + p] = sum;
+ scOut[blockBase + p] = sum_lanes[i];
 }
 }
 }
diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h
index d5069d7411..f1aba71545 100644
--- a/nanovdb/nanovdb/util/WenoStencil.h
+++ b/nanovdb/nanovdb/util/WenoStencil.h
@@ -4,76 +4,160 @@
 /*!
 \file WenoStencil.h

- \brief 19-tap WENO5 stencil data container + per-tap out-of-band
- extrapolation, templated on SIMD lane width.
+ \brief 19-tap WENO5 stencil value container + out-of-band extrapolation +
+ fifth-order upwind Godunov's norm-square gradient. Templated on
+ SIMD lane width W.
- `WenoStencil` holds the per-tap float values and per-tap activity
- flags for a single voxel (W=1, scalar / GPU-friendly) or a batch of W
- voxels (W>1, CPU SIMD). The underlying element types switch via
- `std::conditional_t`:
+ `WenoStencil` holds the per-tap float values and activity flags for a
+ single voxel (W=1, scalar / GPU-friendly) or a batch of W voxels (W>1,
+ CPU SIMD). Storage is first-class Simd types directly:

- W == 1 : ValueT = float PredT = bool
- W > 1 : ValueT = float[W] PredT = bool[W]
+ FloatV values [19] ≡ Simd<float, W> values [19]
+ MaskV isActive[19] ≡ SimdMask<float, W> isActive[19]

- Storage is a plain C array (`ValueT mValues[SIZE]`) so the caller can
- fill it lane-by-lane with the same scalar syntax in both cases
- (`s.mValues[k][i] = ...` at W>1; `s.mValues[k] = ...` at W=1).
+ At W=1 the Simd types collapse to plain float / bool so scalar CUDA code
+ reads as plain scalar arithmetic, and the class is pure-compute — the
+ caller owns any fill-side C-array storage it wants to use for scalar
+ scatter before an explicit load-into-Simd step.

- The class's one substantive operation is `extrapolate(|background|)`,
- which repairs out-of-band lanes (mIsActive[k] == false) by applying
- copysign(|background|, mValues[innerTap]) via an ascending-|Δ|
- cascade. After `extrapolate` returns, every tap holds either its
- true sidecar value (for active lanes) or a sign-corrected background
- magnitude (for inactive lanes) — ready for WENO5 arithmetic.
+ Grid-spacing scalars `mDx2` and `mInvDx2` stay scalar `float` at every
+ W and are broadcast to FloatV at the point of use.

- The inner-tap mapping is spelled out explicitly (Weno5-specific,
- non-generic on purpose):
+ Operations provided:
+ - extrapolate(absBackground)
+ Repair out-of-band lanes (isActive[k] == false) via
+ copysign(absBackground, values[innerTap]), processed in
+ ascending-|Δ| order so the inner tap is already resolved.
+ - normSqGrad(isoValue = 0)
+ Godunov's norm-square of the fifth-order WENO upwind gradient.
+ Matches the semantics of WenoStencil::normSqGrad(isoValue) in
+ nanovdb/math/Stencils.h (the ground-truth scalar reference).

- |Δ|=1 taps --> inner = center tap (0,0,0)
- |Δ|=2 taps --> inner = |Δ|=1 tap on the same axis
- |Δ|=3 taps --> inner = |Δ|=2 tap on the same axis
-
- Cascade order (ascending-|Δ|) guarantees the inner tap is already
- resolved when the outer tap is processed, so distance-3 taps inherit
- sign via the |Δ|=1 → |Δ|=2 → |Δ|=3 chain without special casing.
-
- See BatchAccessor.md §11 for the full Phase-2 sidecar-WENO pipeline
- design and §11.2 for the extrapolation semantics.
+ See BatchAccessor.md §11 for the full Phase-2 sidecar-WENO pipeline design
+ and §11.2 for the extrapolation semantics.
 */

 #pragma once

 #include
 #include <nanovdb/util/StencilAccessor.h> // StencilPoint, Weno5Stencil, detail::findIndex
+#include <nanovdb/math/Math.h> // Pow2

 #include
 #include
-#include
 #include

 namespace nanovdb {
+namespace detail {
+
+// ---------------------------------------------------------------------------
+// Generic-T WENO5 reconstruction — templated on T ∈ {float, Simd<float, W>}.
+//
+// Structurally identical to nanovdb::math::WENO5 (ground-truth scalar WENO5
+// in nanovdb/math/Stencils.h), transliterated to use only primitives that
+// exist for both scalar T=float and T=Simd<float, W>: operator+/-/*,
+// math::Pow2. No ternaries, no if/else — same source compiles to scalar or
+// SIMD code via the Simd backend in Simd.h.
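+//
+// For reference, the body below is the classic WENO5 convex combination of
+// three candidate stencils fk with ideal weights 0.1 / 0.6 / 0.3 and
+// smoothness-indicator-based weights Ak (Jiang & Shu 1996):
+//     f = (A1·f1 + A2·f2 + A3·f3) / (A1 + A2 + A3),
+// with each candidate's /6 normalisation folded into the final denominator.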
+//
+// scale2 is the optional reference magnitude (squared) used to scale the
+// numerical epsilon; kept as a plain float for broadcast-on-demand.
+// ---------------------------------------------------------------------------
+template <typename T, typename RealT = T>
+NANOVDB_SIMD_HOSTDEV inline T WENO5(const T& v1, const T& v2, const T& v3,
+ const T& v4, const T& v5,
+ float scale2 = 1.f)
+{
+ const RealT C = RealT(13.f / 12.f);
+ const RealT eps = RealT(1.e-6f * scale2);
+
+ const RealT A1 = RealT(0.1f) / math::Pow2(
+ C * math::Pow2(v1 - RealT(2)*v2 + v3)
+ + RealT(0.25f) * math::Pow2(v1 - RealT(4)*v2 + RealT(3)*v3) + eps);
+ const RealT A2 = RealT(0.6f) / math::Pow2(
+ C * math::Pow2(v2 - RealT(2)*v3 + v4)
+ + RealT(0.25f) * math::Pow2(v2 - v4) + eps);
+ const RealT A3 = RealT(0.3f) / math::Pow2(
+ C * math::Pow2(v3 - RealT(2)*v4 + v5)
+ + RealT(0.25f) * math::Pow2(RealT(3)*v3 - RealT(4)*v4 + v5) + eps);
+
+ return (A1 * (RealT( 2)*v1 - RealT(7)*v2 + RealT(11)*v3)
+ + A2 * (RealT( 5)*v3 - v2 + RealT( 2)*v4)
+ + A3 * (RealT( 2)*v3 + RealT(5)*v4 - v5))
+ / (RealT(6) * (A1 + A2 + A3));
+}
+
+// ---------------------------------------------------------------------------
+// Generic-T Godunov's norm-square gradient — templated on T (value type) and
+// MaskT (mask type that `>` of T produces). Ground-truth scalar version is
+// nanovdb::math::GodunovsNormSqrd in nanovdb/math/Stencils.h, which uses a
+// runtime if/else on `isOutside`. Here we compute both branches uncondition-
+// ally and blend via util::where, so the SIMD path has no control-flow
+// divergence across lanes. At T=float the scalar where(bool, T, T) overload
+// degenerates this to the same semantics as the if/else.
+// ---------------------------------------------------------------------------
+template <typename T, typename MaskT>
+NANOVDB_SIMD_HOSTDEV inline T GodunovsNormSqrd(MaskT isOutside,
+ T dP_xm, T dP_xp,
+ T dP_ym, T dP_yp,
+ T dP_zm, T dP_zp)
+{
+ using util::min; using util::max; using util::where;
+ const T zero(0.f);
+
+ const T outside = max(math::Pow2(max(dP_xm, zero)), math::Pow2(min(dP_xp, zero))) // (dP/dx)²
+ + max(math::Pow2(max(dP_ym, zero)), math::Pow2(min(dP_yp, zero))) // (dP/dy)²
+ + max(math::Pow2(max(dP_zm, zero)), math::Pow2(min(dP_zp, zero))); // (dP/dz)²
+
+ const T inside = max(math::Pow2(min(dP_xm, zero)), math::Pow2(max(dP_xp, zero)))
+ + max(math::Pow2(min(dP_ym, zero)), math::Pow2(max(dP_yp, zero)))
+ + max(math::Pow2(min(dP_zm, zero)), math::Pow2(max(dP_zp, zero)));
+
+ return where(isOutside, outside, inside);
+}
+
+} // namespace detail
+
+// ---------------------------------------------------------------------------
+// WenoStencil — pure-compute container for a 19-tap WENO5 stencil state.
+//
+// The class holds only Simd-typed compute state + scalar grid constants.
+// Fill-side responsibility (scalar writes into any raw float/bool buffers,
+// followed by a SIMD load-per-tap into this stencil's values[] / isActive[])
+// lives in the caller. See WenoStencil.md §6 for usage patterns.
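+//
+// Caller-side shape at W>1, per tap k (a sketch mirroring the
+// sidecar-stencil-extrap pass in ex_narrowband_stencil_cpu):
+//
+//     alignas(64) float raw_values[SIZE][W];  // scalar scatter target
+//     /* ... raw_values[k][i] = sidecar[idx]; ... */
+//     stencil.values[k] = FloatV(raw_values[k], util::element_aligned);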
+// --------------------------------------------------------------------------- template class WenoStencil { public: - using Taps = Weno5Stencil::Taps; - using Hull = Weno5Stencil::Hull; - static constexpr int SIZE = int(std::tuple_size_v); + static constexpr int SIZE = 19; - // Per-lane storage shape chosen by W: - // W == 1 : plain scalar (GPU thread-per-voxel model) - // W > 1 : W-wide array (CPU SIMD batch) - using ValueT = std::conditional_t; - using PredT = std::conditional_t; + using Taps = Weno5Stencil::Taps; + using FloatV = util::Simd ; + using MaskV = util::SimdMask; - alignas(64) ValueT mValues [SIZE]; - alignas(64) PredT mIsActive[SIZE]; + // Compute-side storage — first-class Simd values. At W=1 these collapse + // to plain scalar float / bool under the array backend. + FloatV values [SIZE]; + MaskV isActive[SIZE]; + + // Runtime grid-spacing constants — plain scalars at every W, broadcast + // to FloatV at the use sites inside normSqGrad(). Storing them as + // scalars saves YMM-register pressure (vbroadcastss folds into the FMA + // consumer on x86) and keeps the W=1 code path free of any Simd wrapper. + float mDx2{1.f}; // dx² — fed to WENO5's epsilon via scale2 + float mInvDx2{1.f}; // 1 / dx² — final normalisation in normSqGrad + + NANOVDB_SIMD_HOSTDEV WenoStencil() = default; + NANOVDB_SIMD_HOSTDEV explicit WenoStencil(float dx) + : mDx2(dx * dx), mInvDx2(1.f / (dx * dx)) {} static constexpr int size() { return SIZE; } - // Compile-time named-tap access: returns the index of tap (DI,DJ,DK) - // in the Taps tuple, matching StencilAccessor's convention. + // Compile-time named-tap access: returns the index of tap (DI,DJ,DK) in + // the Taps tuple. Ordering matches WenoPt::idx in + // nanovdb/math/Stencils.h, so this is interoperable with canonical WENO + // index conventions. template static constexpr int tapIndex() { @@ -83,38 +167,42 @@ class WenoStencil return I; } - // Replace out-of-band lanes (mIsActive[k][i] == false) of mValues[k] - // with copysign(absBackground, mValues[innerTap][i]). Active lanes - // are untouched. Center tap (0,0,0) is assumed always in-band and - // is not processed. + // ------------------------------------------------------------------ + // extrapolate — repair out-of-band lanes (isActive[k][i] == false) of + // values[k] with copysign(absBackground, values[innerTap][i]). Active + // lanes are preserved. Center tap (idx 0) is assumed always in-band + // and is not processed. + // + // Processes 18 (tap, innerTap) pairs in ascending-|Δ| order so the + // inner tap is already resolved when the outer tap is touched; + // sign-inheritance through |Δ|=1 → |Δ|=2 → |Δ|=3 is automatic. // - // Requires absBackground >= 0 (caller typically passes - // std::abs(floatGrid.background()) or sidecar[0] for a narrow-band - // level set where background > 0). - void extrapolate(float absBackground); + // Requires absBackground ≥ 0. + // ------------------------------------------------------------------ + NANOVDB_SIMD_HOSTDEV void extrapolate(float absBackground); -private: - // Bridge W=1 (scalar reference) and W>1 (array decays to pointer). - // The address taken at W=1 is to the scalar member of mValues/mIsActive; - // at W>1 an array-to-pointer decay works without extra syntax. 
- static constexpr float* addr( ValueT& v) noexcept { - if constexpr (W == 1) return &v; else return v; - } - static constexpr const float* addr(const ValueT& v) noexcept { - if constexpr (W == 1) return &v; else return v; - } - static constexpr bool* addr( PredT& p) noexcept { - if constexpr (W == 1) return &p; else return p; - } - static constexpr const bool* addr(const PredT& p) noexcept { - if constexpr (W == 1) return &p; else return p; - } + // ------------------------------------------------------------------ + // normSqGrad — Godunov's norm-square of the fifth-order WENO upwind + // gradient at the stencil center. Returns |∇φ|². + // + // Semantics match WenoStencil::normSqGrad(isoValue) in + // nanovdb/math/Stencils.h line-for-line: six axial WENO5 reconstructions + // (one pair ±x, ±y, ±z), then Godunov's upwind combinator driven by the + // sign of (center − iso). + // + // Call only after the stencil has been populated (see usage pattern in + // WenoStencil.md §6). extrapolate() is idempotent w.r.t. this — calling + // normSqGrad after extrapolate is the typical pipeline shape, but the + // method itself does not require extrapolate to have been called. + // ------------------------------------------------------------------ + NANOVDB_SIMD_HOSTDEV FloatV normSqGrad(float iso = 0.f) const; +private: // Hardcoded (tap, innerTap) pairs for Weno5Stencil::Taps, ordered by // ascending |Δ|. Indices match the tuple definition in StencilAccessor.h. // // idx 0 : center ( 0, 0, 0) - // idx 1.. 6 : x-axis (-3..+3) + // idx 1.. 6 : x-axis (-3..+3 in the order -3,-2,-1,+1,+2,+3) // idx 7..12 : y-axis (-3..+3) // idx 13..18 : z-axis (-3..+3) static constexpr int kNumPairs = 18; @@ -124,11 +212,11 @@ class WenoStencil { 9, 0}, {10, 0}, // y: -1, +1 {15, 0}, {16, 0}, // z: -1, +1 // |Δ|=2 (inner tap = |Δ|=1 on same axis) - { 2, 3}, { 5, 4}, // x: -2<-(-1), +2<-(+1) + { 2, 3}, { 5, 4}, // x: -2 ← (-1), +2 ← (+1) { 8, 9}, {11, 10}, // y {14, 15}, {17, 16}, // z // |Δ|=3 (inner tap = |Δ|=2 on same axis) - { 1, 2}, { 6, 5}, // x: -3<-(-2), +3<-(+2) + { 1, 2}, { 6, 5}, // x: -3 ← (-2), +3 ← (+2) { 7, 8}, {12, 11}, // y {13, 14}, {18, 17} // z }; @@ -137,36 +225,55 @@ class WenoStencil // --------------------------------------------------------------------------- // extrapolate — single-source implementation. // -// Same body compiles for scalar (W=1) and SIMD (W>1): Simd.h's fixed_size<1> -// path collapses every instruction to a scalar store. The only non-uniform -// bit is the addr() helper above. +// values[] and isActive[] are already Simd-typed; the algorithm is a +// sequence of whole-SIMD blends (plus a broadcast of absBg) per pair. +// Same source body compiles at W=1 (Simd collapses to scalar) +// and W>1 (native SIMD width). 
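+//
+// Worked example (one lane, +x half-axis): suppose isActive[4] is true
+// with values[4] = -0.7, while taps +2 and +3 (idx 5 and 6) are inactive
+// (illustrative values only). The |Δ|=2 pass sets
+//     values[5] = copysign(absBg, values[4]) = -absBg;
+// the |Δ|=3 pass then sets
+//     values[6] = copysign(absBg, values[5]) = -absBg.
+// The sign is inherited transitively, which is why the pairs must be
+// processed in ascending-|Δ| order; at W>1 the same happens per lane
+// through the blends below.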
// --------------------------------------------------------------------------- template -void WenoStencil::extrapolate(float absBackground) +NANOVDB_SIMD_HOSTDEV void WenoStencil::extrapolate(float absBackground) { - using FloatV = nanovdb::util::Simd ; - using MaskV = nanovdb::util::SimdMask; - - const FloatV absBg(absBackground); // broadcast - const FloatV zero (0.0f); + const FloatV absBg(absBackground); + const FloatV zero (0.f); for (int p = 0; p < kNumPairs; ++p) { const int k = kPairs[p][0]; const int kInner = kPairs[p][1]; - const MaskV active(addr(mIsActive[k]), nanovdb::util::element_aligned); - const FloatV val (addr(mValues [k]), nanovdb::util::element_aligned); - const FloatV inner (addr(mValues [kInner]), nanovdb::util::element_aligned); - // copysign(absBg, inner): +absBg if inner >= 0, else -absBg. - const MaskV isNegInner = zero > inner; - const FloatV extrap = nanovdb::util::where(isNegInner, -absBg, absBg); - - // Active lanes keep `val`; inactive lanes take `extrap`. - const FloatV result = nanovdb::util::where(active, val, extrap); + const MaskV isNegInner = zero > values[kInner]; + const FloatV extrap = util::where(isNegInner, -absBg, absBg); - nanovdb::util::store(result, addr(mValues[k]), nanovdb::util::element_aligned); + // Active lanes keep their own value; inactive lanes take the extrapolated sign-corrected background. + values[k] = util::where(isActive[k], values[k], extrap); } } +// --------------------------------------------------------------------------- +// normSqGrad — Godunov's upwind WENO norm-square gradient. +// +// Structurally mirrors WenoStencil::normSqGrad(isoValue) in +// nanovdb/math/Stencils.h: six axial WENO5 reconstructions driving +// GodunovsNormSqrd. Tap indices 0..18 match WenoPt::idx in that +// file. mInvDx2 and iso are broadcast to FloatV at the final +// combinator only (free on x86; identity at W=1). +// --------------------------------------------------------------------------- +template +NANOVDB_SIMD_HOSTDEV typename WenoStencil::FloatV +WenoStencil::normSqGrad(float iso) const +{ + const FloatV* v = values; + + const FloatV dP_xm = detail::WENO5(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2); + const FloatV dP_xp = detail::WENO5(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2); + const FloatV dP_ym = detail::WENO5(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2); + const FloatV dP_yp = detail::WENO5(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2); + const FloatV dP_zm = detail::WENO5(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2); + const FloatV dP_zp = detail::WENO5(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2); + + return FloatV(mInvDx2) * + detail::GodunovsNormSqrd(v[0] > FloatV(iso), + dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp); +} + } // namespace nanovdb diff --git a/nanovdb/nanovdb/util/WenoStencil.md b/nanovdb/nanovdb/util/WenoStencil.md index 2a3899b45e..98585cd478 100644 --- a/nanovdb/nanovdb/util/WenoStencil.md +++ b/nanovdb/nanovdb/util/WenoStencil.md @@ -2,131 +2,143 @@ Design reference for `nanovdb/nanovdb/util/WenoStencil.h`. Captures the rationale behind the templated-on-lane-width class, the out-of-band -extrapolation algorithm, and the relationship to the broader Phase-2 -pipeline sketched in `BatchAccessor.md §11`. +extrapolation algorithm, the Godunov norm-square-gradient method, and the +relationship to the broader Phase-2 pipeline sketched in +`BatchAccessor.md §11`. 
--- ## 1. Motivation -The WENO5 CPU pipeline (`BatchAccessor.md §11`) produces a per-batch -matrix of 19 tap values per voxel: +The WENO5 CPU pipeline (`BatchAccessor.md §11`) assembles, per voxel +batch, a 19-tap value matrix with per-tap activity flags: ``` float values[Ntaps][W] -- real sidecar value, or background for OOB lanes bool isActive[Ntaps][W] -- true iff the tap voxel is in the narrow band ``` -The next pipeline phase consumes `values[tap][lane]` as 19 `Simd` -rows for the WENO5 reconstruction arithmetic. But before arithmetic, -out-of-band lanes must be repaired so that `values[tap][lane]` holds a -sensible float for every lane, not just the in-band ones. That repair -is the **extrapolation** step. +Downstream phases are (1) **extrapolation** to repair out-of-band lanes +with sign-corrected background, and (2) **WENO5 arithmetic** (the fifth- +order upwind Godunov norm-square-gradient) to produce a per-voxel +`|∇φ|²` scalar. -`WenoStencil` encapsulates: +`WenoStencil` encapsulates the compute state and both operations: -1. Storage for the 19-tap × W-lane data + activity flags. -2. The extrapolation algorithm (ascending-|Δ| cascade, hardcoded tap pairs). -3. A pattern that keeps GPU (W=1) code textually identical to CPU SIMD - (W>1) code — only the storage shapes and the Simd.h backend differ. +1. Storage of the 19-tap × W-lane Simd values + activity masks. +2. `extrapolate()` — ascending-|Δ| cascade, hardcoded tap pairs. +3. `normSqGrad()` — six axial WENO5 reconstructions + Godunov combinator. +4. A single-source pattern that keeps GPU (W=1, per-thread) code + textually identical to CPU SIMD (W>1) code — only the compile-time + lane width changes. --- ## 2. Single-source scalar/SIMD design -### 2.1 The conditional_t storage trick +### 2.1 `Simd` as the storage type + +Storage is `Simd` / `SimdMask` directly — *not* raw +`float[W]` / `bool[W]` arrays: ```cpp template class WenoStencil { public: - using ValueT = std::conditional_t; - using PredT = std::conditional_t; + using FloatV = util::Simd ; + using MaskV = util::SimdMask; + + FloatV values [SIZE]; + MaskV isActive[SIZE]; - alignas(64) ValueT mValues [SIZE]; - alignas(64) PredT mIsActive[SIZE]; - /* ... */ + float mDx2{1.f}; // dx² — scalar, broadcast on use + float mInvDx2{1.f}; // 1 / dx² — scalar, broadcast on use + /* ... extrapolate(), normSqGrad() ... */ }; ``` -| W | `mValues` expands to | `mIsActive` expands to | Intended use | -|---|---|---|---| -| 1 | `float mValues[19]` | `bool mIsActive[19]` | GPU thread-per-voxel — 19 scalar registers | -| 16 | `float mValues[19][16]` | `bool mIsActive[19][16]` | CPU SIMD batch — 19 YMM-tiles in L1 | - -Same declaration syntax, different expansion. Memory layout at W>1 is -identical to writing `float mValues[SIZE][W]` directly — no performance -difference, just a cleaner scalar case at W=1. - -### 2.2 Why the same source compiles to good scalar and SIMD code - -`extrapolate()`'s body uses only `nanovdb::util::Simd` primitives: - -- load / store (ctor + free `store`) -- `operator>` (produces `SimdMask`) -- unary `operator-` -- `where(mask, a, b)` — 3-arg blend - -All of these exist in Simd.h's scalar degenerate (`fixed_size<1>` in the -stdx backend, 1-element array in the array backend). At W=1 the -compiler inlines and collapses every operation to a plain scalar -instruction. **One source body, two target ISAs, no `if constexpr` -branches on W inside the algorithm.** - ---- - -## 3. 
The `addr()` bridge - -One asymmetry survives the `conditional_t` unification: at W=1, -`mValues[k]` is a scalar `float` value (not an array), so it does not -decay to `float*` — `Simd::load(mValues[k], flags)` wouldn't -type-check. At W>1, `mValues[k]` is a `float[W]` array and decays -naturally. - -A private `addr()` helper papers over this in one place: +At W=1 the Simd types collapse to plain `float` / `bool` under the +array backend. At W>1 they are the backend-native SIMD type (stdx or +array wrapper). Memory layout at W>1 is identical to `float[SIZE][W]` +directly, so there is no storage-cost penalty — just a cleaner +compute-side type. + +### 2.2 Why first-class Simd storage (vs raw C arrays + `addr()` bridge) + +An earlier version used `std::conditional_t` as +the element type, with an `addr()` helper to normalize the W=1 scalar- +reference vs W>1 array-decay at every SIMD load/store site. That +design was rejected in favour of Simd-typed storage for three reasons: + +- **W=1 ceremony**. Approach 1 forced the scalar case to read + `FloatV val(addr(mValues[k]), element_aligned)` — loading a + `Simd` from a scalar reference. On the CUDA per-thread + path (where the scalar case is the *production* pipeline, not a + degenerate convenience) this ceremony survives into every + `__hostdev__` method that reads the stencil. Under Simd-typed + storage, W=1 reads `FloatV val = values[k]` — pure scalar code. + +- **Arithmetic boundary ceremony**. Approach 1 made `extrapolate()` + and a prospective `normSqGrad()` bracket every read/write with an + explicit Simd load or store. With Simd-typed storage, the + arithmetic reads as if scalar (`values[k] = util::where(...)`) and + the load/store boundary moves out to the caller where the Simd + values meet raw fill-side buffers. + +- **Symmetric, explicit boundary placement**. The caller already owns + the fill-side scalar-scatter loop (sidecar `sidecar[idx]` gathers are + inherently scalar-indexed per lane). Making the array→Simd + conversion an explicit caller-side step (`FloatV(raw_values[k], + element_aligned)`) preserves that ownership — the arithmetic class + doesn't care where its data came from. + +### 2.3 Scalar runtime constants, broadcast on use + +`mDx2` and `mInvDx2` stay plain `float` at every W. They are +broadcast to `FloatV` at the point of use inside `normSqGrad()`: ```cpp -static constexpr float* addr(ValueT& v) noexcept { - if constexpr (W == 1) return &v; else return v; -} +return FloatV(mInvDx2) * detail::GodunovsNormSqrd(...); ``` -Callers (inside `extrapolate()`) always write: +`vbroadcastss` is free on x86 (folds into the FMA consumer); identity +at W=1. Storing these as `Simd` instead would cost 64 +bytes × 2 of storage and hold two YMM registers across the entire +kernel lifetime for no benefit. -```cpp -FloatV val(addr(mValues[k]), element_aligned); -``` +### 2.4 Caller-owned fill-side buffers -and get a uniform expression that works at any W. There are four -overloads (`ValueT&` / `const ValueT&` / `PredT&` / `const PredT&`); -all are `constexpr` and compile to a no-op at W>1 and a trivial address -fetch at W=1. +The class has **no** fill-side storage — no `mValues`/`mIsActive` raw C +arrays. Callers own whatever shape of raw data is natural for them. +For the CPU SIMD case that's typically a pair of stack-local +`alignas(64)` C arrays sized `[SIZE][W]`; for the CUDA per-thread case +no intermediate buffer is needed at all. -**Alternative considered (rejected):** overloading `Simd::load` -to accept `T&` at W=1. 
Blocked by the stdx backend's type-alias -representation (`using Simd = stdx::fixed_size_simd`) — we can't add -member ctors to an alias. The equivalent free-function workaround -turned out no shorter than the `addr()` helper and would have forced -Simd.h churn for a benefit scoped to one class. See the Stage-3 design -exchange for the full discussion. +This preserves the arithmetic class's purity and gives callers flex- +ibility — a different Phase-2 path (e.g. a halo-based fetch, or a +future hardware-gather fill) can populate the stencil using whatever +pattern fits, without the class having to expose a "fill API" that +bakes in one shape. --- -## 4. Extrapolation semantics +## 3. Extrapolation semantics -### 4.1 The out-of-band problem +### 3.1 The out-of-band problem For a narrow-band SDF, only the center tap `<0,0,0>` is guaranteed to be in the active narrow band. Every other tap may land outside the -band for some lanes of a batch — for those lanes, `idx == 0` in the -sidecar fill, so `values[k][lane] = sidecar[0] = |background|` and +band for some lanes — for those lanes the sidecar fill writes +`values[k][lane] = sidecar[0]` (magnitude of the background, since the +sidecar builder pre-sets slot 0 to `floatGrid.background()`) and `isActive[k][lane] = false`. -Applying WENO5 arithmetic directly to the `|background|` magnitude -produces wrong gradients at the band boundary: the reconstructed field -would not track the sign of the underlying signed distance function. -The standard fix is to **extrapolate from the next inner tap's sign**: +Applying WENO5 arithmetic directly to the unsigned `|background|` +magnitude produces wrong gradients at the band boundary: the +reconstructed field would not track the sign of the underlying signed- +distance function. The standard fix is to **extrapolate from the next +inner tap's sign**: ``` if (!isActive[k][lane]) @@ -135,9 +147,9 @@ if (!isActive[k][lane]) The `|background|` magnitude is preserved; the sign is copied from whichever "inner" tap (one step closer to center along the same axis) -best represents which side of the surface this lane belongs to. +best represents which side of the surface the lane belongs to. -### 4.2 Inner-tap cascade — ascending |Δ| order +### 3.2 Inner-tap cascade — ascending |Δ| order | Outer tap |Δ| | Inner tap (source of sign) | |---|---| @@ -146,18 +158,17 @@ best represents which side of the surface this lane belongs to. | `<±3,0,0>`, `<0,±3,0>`, `<0,0,±3>` | `<±2,0,0>`, `<0,±2,0>`, `<0,0,±2>` | Processing taps in ascending-|Δ| order guarantees the inner tap is -already resolved (real value or previously extrapolated) when the outer -tap is processed. Sign propagation through a |Δ|=1 → |Δ|=2 → |Δ|=3 -chain is automatic — no special casing. +already resolved (real value or previously extrapolated) when the +outer tap is processed. Sign propagation through a |Δ|=1 → |Δ|=2 → +|Δ|=3 chain is transitive — no special casing. -### 4.3 The `kPairs[]` table +### 3.3 The `kPairs[]` table The inner-tap relationship is `Weno5Stencil`-specific and hardcoded as a static table inside the class: ```cpp -static constexpr int kNumPairs = 18; -static constexpr int kPairs[kNumPairs][2] = { +static constexpr int kPairs[18][2] = { // |Δ|=1 (inner = center, idx 0) {3,0},{4,0},{9,0},{10,0},{15,0},{16,0}, // |Δ|=2 (inner = |Δ|=1 on same axis) @@ -168,147 +179,254 @@ static constexpr int kPairs[kNumPairs][2] = { ``` Indices match the tuple ordering in `Weno5Stencil::Taps` -(`StencilAccessor.h`). 
Center tap (idx 0) is not processed — assumed -always in-band. +(`StencilAccessor.h`) and `WenoPt::idx` in +`nanovdb/math/Stencils.h`. Center tap (idx 0) is not processed — +assumed always in-band. **Why hardcoded, not template-derived:** a generic scheme would walk `Weno5Stencil::Taps` at compile time and derive inner-tap indices from -|Δ| and axis alignment. For a single stencil (Weno5) this is -over-engineering: the table is 18 entries, reads directly, and makes -the cascade ordering self-documenting. Worth revisiting if we add -Weno7 or other axis-aligned WENO variants. - ---- +|Δ| and axis alignment. For a single stencil the table is 18 entries, +reads directly, and makes the cascade ordering self-documenting. +Worth revisiting if we add Weno7 or other axis-aligned WENO variants. -## 5. Extrapolate — implementation +### 3.4 `extrapolate()` implementation ```cpp template void WenoStencil::extrapolate(float absBackground) { - using FloatV = nanovdb::util::Simd ; - using MaskV = nanovdb::util::SimdMask; - const FloatV absBg(absBackground); - const FloatV zero (0.0f); + const FloatV zero (0.f); for (int p = 0; p < kNumPairs; ++p) { const int k = kPairs[p][0]; const int kInner = kPairs[p][1]; - const MaskV active(addr(mIsActive[k]), element_aligned); - const FloatV val (addr(mValues [k]), element_aligned); - const FloatV inner (addr(mValues [kInner]), element_aligned); - // copysign(absBg, inner): +absBg if inner >= 0, else -absBg. - const MaskV isNegInner = zero > inner; - const FloatV extrap = where(isNegInner, -absBg, absBg); + const MaskV isNegInner = zero > values[kInner]; + const FloatV extrap = util::where(isNegInner, -absBg, absBg); - // Active lanes keep `val`; inactive lanes take `extrap`. - const FloatV result = where(active, val, extrap); - store(result, addr(mValues[k]), element_aligned); + // Active lanes keep their own value; inactive take the extrapolated sign-corrected background. + values[k] = util::where(isActive[k], values[k], extrap); } } ``` +No `addr()`. No `element_aligned`. Reads `values[]` as Simd, +operates as Simd, writes Simd — the kernel body never drops to the +underlying scalar/array representation. + **Per-pair cost (W=16, AVX2):** | Op | Cycles (est.) | |----|--------------:| -| 3× load (mIsActive, mValues[k], mValues[kInner]) | 3 | -| `0 > inner` (vcmpltps + sign mask) | 1 | +| compare `zero > values[kInner]` | 1 | | `where(isNegInner, -absBg, absBg)` (vblendvps) | 1 | -| `where(active, val, extrap)` (mask convert + vblendvps) | 1–2 | -| 1× store (mValues[k]) | 1 | -| **≈ 7 cycles / pair** | +| `where(isActive[k], values[k], extrap)` (mask convert + vblendvps) | 1–2 | +| **≈ 4 cycles / pair** (values[] register-resident) | + +Total: 18 pairs × ~4 cycles = ~72 cycles per call — lower than the +Approach-1 estimate of ~7 cycles/pair, because we no longer do the +explicit per-pair load. The Simd values live in YMM registers across +the pair loop. Measured end-to-end cost (sidecar-stencil-extrap minus +sidecar-stencil): ~0.14–0.19 ns/voxel on 24 threads. -Total: 18 pairs × 7 cycles = ~126 cycles per call. Amortised over -W=16 lanes gives ~8 cycles/voxel, or ~2 ns/voxel on a 4 GHz core. +--- -Measured overhead in `sidecar-stencil-extrap` pass on taperLER.vdb -(24 threads): **+4.5 ms / 31.8M voxels = 0.14 ns/voxel** end-to-end — -lines up with the per-core estimate divided by thread count (24× -speedup ≈ 2 / 24 ≈ 0.083 ns; measurement includes framing overhead). +## 4. 
Godunov norm-square-gradient -**Skipping active lanes:** the algorithm reads and computes for every -lane regardless of `isActive`. For active lanes, `extrap` is computed -but then discarded by the final `where`. This wasted work is cheaper -than a predicated-store alternative because: +### 4.1 Semantics — tracking the ground-truth scalar -- The SIMD blend is one instruction (`vblendvps`). -- Per-lane branching would serialize the batch. -- Active-fraction is high (~90% on narrow-band SDFs), so masked - computation saves little even in the best case. +`nanovdb::math::WenoStencil::normSqGrad(isoValue)` in +`nanovdb/math/Stencils.h` is the ground-truth scalar reference. Its +body: ---- +```cpp +const ValueType* v = mValues; +const RealT + dP_xm = WENO5(v[2]-v[1], v[3]-v[2], v[0]-v[3], v[4]-v[0], v[5]-v[4], mDx2), + dP_xp = WENO5(v[6]-v[5], v[5]-v[4], v[4]-v[0], v[0]-v[3], v[3]-v[2], mDx2), + dP_ym = ..., dP_yp = ..., dP_zm = ..., dP_zp = ...; +return mInvDx2 * GodunovsNormSqrd(v[0] > isoValue, dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp); +``` + +`WenoStencil::normSqGrad(iso)` is a line-for-line transliteration of +the same body, with three adaptations: + +1. `v = values` (Simd-typed storage, not `mValues` scalar array). +2. Local `dP_*` are `FloatV` rather than `RealT`. +3. The final `mInvDx2 * ...` multiplication broadcasts the scalar + `mInvDx2` to `FloatV` (via `FloatV(mInvDx2)`); at W=1 this is a + no-op. + +### 4.2 `WENO5` — generic over scalar and Simd -## 6. API usage +The six axial reconstructions are driven by a single free-function +template `nanovdb::detail::WENO5` that mirrors +`nanovdb::math::WENO5` exactly (Shu ICASE +smoothness indicators, 0.1/0.6/0.3 linear weights, static_cast at the +end replaced by the trailing division). Structure is the same; only +the literal constants are wrapped in `RealT(...)` constructors to +broadcast at W>1. Lives in `nanovdb::detail` to keep the naming +convention close to the ground-truth without colliding with the +existing `nanovdb::math::WENO5`. -### 6.1 Filling the stencil from sidecar indices +### 4.3 `GodunovsNormSqrd` — `where`-based, no control flow + +The scalar ground-truth has a runtime `if (isOutside) { … } else { … }`. +The generic-T version computes both branches unconditionally and +blends via `util::where`: ```cpp -WenoStencil stencil; -StencilAccessor acc(grid, ...); -acc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - -for (int k = 0; k < WenoStencil::size(); ++k) { - for (int i = 0; i < SIMDw; ++i) { - const uint64_t idx = acc.mIndices[k][i]; - stencil.mValues [k][i] = sidecar[idx]; // sidecar[0] = background - stencil.mIsActive[k][i] = (idx != 0); - } +template +inline T GodunovsNormSqrd(MaskT isOutside, + T dP_xm, T dP_xp, T dP_ym, T dP_yp, T dP_zm, T dP_zp) +{ + const T zero(0.f); + const T outside = max(Pow2(max(dP_xm, zero)), Pow2(min(dP_xp, zero))) // (dP/dx)² + + max(Pow2(max(dP_ym, zero)), Pow2(min(dP_yp, zero))) + + max(Pow2(max(dP_zm, zero)), Pow2(min(dP_zp, zero))); + const T inside = max(Pow2(min(dP_xm, zero)), Pow2(max(dP_xp, zero))) + + max(Pow2(min(dP_ym, zero)), Pow2(max(dP_yp, zero))) + + max(Pow2(min(dP_zm, zero)), Pow2(max(dP_zp, zero))); + return where(isOutside, outside, inside); } ``` -At W=1 (GPU per-thread) the same body would just drop the `[i]` index: +At `T=float, MaskT=bool` this compiles to scalar code with both +branches speculatively evaluated — slightly slower than the +ground-truth's branchy form for scalar workloads in isolation, but +identical correctness. 
At `T=Simd, MaskT=SimdMask` +the branches are unconditional SIMD compute plus a `vblendvps` — +no lane-divergent branches, no scalarisation. + +Per-lane cost of the full `normSqGrad`: + +| Phase | Ops | +|-------|-----| +| 6× axial WENO5 | ~60 mul/add/fma + 6× reciprocals (the `0.N / Pow2(…)` terms) | +| Godunov: 12× max/min + 12× mul + 5× add, both branches | ~29 ops | +| Blend + final multiply by FloatV(mInvDx2) | 2 ops | + +Roughly ~100 arithmetic ops per voxel per `normSqGrad` call. At W=16 +AVX2 that's ~100 / 16 ≈ 6.3 cycles/voxel × some FMA-throughput factor +— call it 2 ns/voxel single-threaded, 0.1 ns/voxel on 24 threads. +(To be validated by measurement; see §7.1 Future work.) + +--- + +## 5. API usage + +### 5.1 CPU SIMD — caller-owned raw buffers, explicit load ```cpp -stencil.mValues [k] = sidecar[idx]; -stencil.mIsActive[k] = (idx != 0); +// Caller owns its scalar-scatter target. +alignas(64) float raw_values[SIZE][W]; +alignas(64) bool raw_active[SIZE][W]; + +nanovdb::WenoStencil stencil(dx); + +// Fill — pure scalar stores, guaranteed fast codegen on all backends. +for (int k = 0; k < SIZE; ++k) { + for (int i = 0; i < W; ++i) { + const uint64_t idx = /* sidecar index for tap k, lane i */; + raw_values[k][i] = sidecar[idx]; + raw_active[k][i] = (idx != 0); + } +} + +// Bridge — one SIMD load per tap. +for (int k = 0; k < SIZE; ++k) { + stencil.values [k] = FloatV(raw_values[k], util::element_aligned); + stencil.isActive[k] = MaskV (raw_active[k], util::element_aligned); +} + +// Arithmetic — reads/writes stencil.values[] as Simd in place. +stencil.extrapolate(std::abs(sidecar[0])); +FloatV normSq = stencil.normSqGrad(/* iso = */ 0.f); + +// Simd → scalar bridge at the output side if downstream consumers are scalar. +alignas(64) float normSq_lanes[W]; +util::store(normSq, normSq_lanes, util::element_aligned); ``` -### 6.2 Extrapolating +### 5.2 CUDA scalar — no intermediate buffer ```cpp -stencil.extrapolate(std::abs(floatGrid.background())); +nanovdb::WenoStencil<1> stencil(dx); +for (int k = 0; k < SIZE; ++k) { + const uint64_t idx = gather_index_for_tap(k); + stencil.values [k] = sidecar[idx]; + stencil.isActive[k] = (idx != 0); +} + +stencil.extrapolate(fabsf(sidecar[0])); +float normSq = stencil.normSqGrad(); ``` -After this call, every `stencil.mValues[k][i]` holds either the real -sidecar value (for active lanes) or a sign-corrected `|background|` -(for inactive lanes). `mIsActive[]` is no longer needed downstream. +`FloatV` is `float` at W=1; direct scalar assignment. `MaskV` is +`bool`. No raw buffers, no `element_aligned`, no load loops — the +per-thread path reads as pure scalar arithmetic. -### 6.3 Compile-time named-tap access +### 5.3 Compile-time named-tap access ```cpp -constexpr int ctr = WenoStencil::tapIndex<0, 0, 0>(); -float centerValue = stencil.mValues[ctr][i]; +constexpr int ctr = WenoStencil::tapIndex<0, 0, 0>(); +FloatV centerValue = stencil.values[ctr]; -constexpr int xm3 = WenoStencil::tapIndex<-3, 0, 0>(); -// ... etc, for WENO5 arithmetic +constexpr int xm3 = WenoStencil::tapIndex<-3, 0, 0>(); +FloatV xm3Value = stencil.values[xm3]; ``` `tapIndex()` forwards to `detail::findIndex` (shared with -`StencilAccessor`), static-asserting at compile time that the requested -tap exists in the Weno5Stencil::Taps tuple. +`StencilAccessor`), static-asserting at compile time that the +requested tap exists in the Weno5Stencil::Taps tuple. --- -## 7. Future work +## 6. 
Ownership boundaries -### 7.1 WENO5 reconstruction method +``` +┌───────────────────────────────────────────────────────────────────┐ +│ Caller │ +│ alignas(64) float raw_values[SIZE][W]; ← fill-side buffer │ +│ alignas(64) bool raw_active[SIZE][W]; │ +│ │ +│ │ +│ │ +│ for k: stencil.values[k] = FloatV(raw_values[k], ...); │ +│ stencil.isActive[k] = MaskV (raw_active[k], ...); │ +│ ═══════════════════════════════════════════════════ Simd border │ +├───────────────────────────────────────────────────────────────────┤ +│ WenoStencil │ +│ FloatV values [19]; MaskV isActive[19]; │ +│ float mDx2, mInvDx2; │ +│ extrapolate() / normSqGrad() — Simd-in / Simd-out, pure compute │ +├───────────────────────────────────────────────────────────────────┤ +│ Caller │ +│ ═══════════════════════════════════════════════════ Simd border │ +│ util::store(normSq, normSq_lanes, util::element_aligned); │ +│ │ +└───────────────────────────────────────────────────────────────────┘ +``` -The class's second substantive operation (not yet implemented) will be -the WENO5 arithmetic itself — a compile-time fold over the 19 tap -rows, producing three `Simd` fluxes (one per axis) from the -fully-resolved `mValues` matrix. Natural signature: +Array↔Simd bridges exist only at the two explicit boundaries where +scalar-indexed I/O meets SIMD-parallel compute. Inside `WenoStencil` +everything is Simd; outside the class the caller chooses whatever +scalar pattern fits its source/sink. -```cpp -struct Weno5Flux { FloatV dx, dy, dz; }; -Weno5Flux reconstruct() const; -``` +--- + +## 7. Future work + +### 7.1 Measurement — lock in the perf numbers -Adopting the same single-source structure: at W=1 the fluxes collapse -to scalars; at W>1 they are SIMD vectors. +Reconstruct()-path (normSqGrad) cost hasn't been measured yet. Next +step: add a `sidecar-stencil-normsqgrad` benchmark pass in +`ex_narrowband_stencil_cpu` to drive normSqGrad to completion on +taperLER.vdb; compare against `sidecar-stencil-extrap` (which writes +the tap-sum instead of normSqGrad) to isolate the Phase-3 arithmetic +cost. ### 7.2 Consolidate the Weno5Stencil policy @@ -317,13 +435,13 @@ Currently `Weno5Stencil` (the tap-tuple policy struct) lives in `using Taps = Weno5Stencil::Taps`. The policy is arguably a Weno-specific definition and could move into `WenoStencil.h`; `StencilAccessor.h` would then `#include <.../WenoStencil.h>` for the -policy. Left as-is for this pass to minimise Stage-3 churn. +policy. Left as-is to minimise churn across files. ### 7.3 Alternative stencils -If/when Weno7 or a non-axis-aligned stencil is needed, the class would -specialise on a stencil-policy template parameter rather than hardcode -`Weno5Stencil`: +If/when Weno7 or a non-axis-aligned stencil is needed, the class +would specialise on a stencil-policy template parameter rather than +hardcode `Weno5Stencil`: ```cpp template @@ -339,16 +457,21 @@ exists. ## 8. Relationship to other design docs -- **`BatchAccessor.md §11`** — the broader Phase-2 pipeline plan - (VBM decode → sidecar assembly → extrapolation → WENO arithmetic → - write-back). WenoStencil implements the "extrapolation" step and - provides the storage that carries data from "sidecar assembly" into - the future "WENO arithmetic" step. +- **`BatchAccessor.md §11`** — the broader Phase-2/3 pipeline plan + (VBM decode → sidecar assembly → extrapolation → WENO arithmetic + → write-back). 
`WenoStencil` implements the extrapolation and + (now) the WENO arithmetic steps; the storage carries data across + from sidecar-assembly. - **`StencilAccessor.md`** — Phase-1 accessor (batched uint64 index - gather). StencilAccessor fills `mIndices[SIZE][W]`; WenoStencil - consumes those indices (via `sidecar[idx]` in user code) and owns - the per-lane float result. + gather). `StencilAccessor` fills `mIndices[SIZE][W]`; callers + consume those indices (via `sidecar[idx]` in their fill loops) and + populate `WenoStencil::values[]` / `isActive[]`. - **`HaloStencilAccessor.md`** — speculative alternative that precomputes a dense float halo buffer; if that path is pursued, - WenoStencil would fill from the halo instead of from sidecar - indices. The extrapolation algorithm here transfers unchanged. + `WenoStencil` would fill from the halo instead of from sidecar + indices. The extrapolation and normSqGrad algorithms here transfer + unchanged. +- **`nanovdb/math/Stencils.h`** — the scalar ground-truth for WENO5 + and Godunov. `WenoStencil::normSqGrad()` is a line-for-line + transliteration of `nanovdb::math::WenoStencil::normSqGrad()` + to generic-T form. From c00fe651b660b62b25c44b1d415d3781d85e7c89 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 20 Apr 2026 23:10:22 -0500 Subject: [PATCH 48/60] ex_narrowband_stencil_cpu: add sidecar-stencil-normsqgrad full-pipeline pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the full Phase-2+3 WENO5 pipeline as a new benchmark pass: decode → sidecar fill → load into WenoStencil → extrapolate → normSqGrad → store |∇φ|² to output sidecar. The pass replaces the debug "sum over 19 taps" intermediate of sidecar-stencil-extrap with the real Phase-3 arithmetic — what goes into the output channel is now the actual norm-square-of-gradient, not an anti-DCE token. Existing sidecar-stencil-extrap kept as a perf-decomposition waypoint that isolates the cost of load+extrapolate alone. Grid voxel size comes from grid->voxelSize()[0] (isotropic assumption for narrow-band level sets); iso = 0 for zero-crossing semantics. Measurements on taperLER.vdb (31.8M active voxels, 24 threads, i9-285K): Sidecar (stencil) : 83.6 ms (2.6 ns/voxel) fill + debug sum Sidecar (stencil+extrap) : 78.5 ms (2.5 ns/voxel) + extrapolate Sidecar (+normSqGrad) : 97.3 ms (3.1 ns/voxel) + full WENO5 + Godunov Phase-3 arithmetic cost isolated: 18.8 ms / 31.8M vox = 0.59 ns/voxel end-to-end on 24 threads. Per-core per-voxel: ~14 ns, roughly matching the expected ~100-FMA WENO5+Godunov budget at AVX2 throughput. Full Phase-2+3 end-to-end over decode: 90.7 ms = 2.85 ns/voxel. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- .../narrowband_stencil_cpu.cpp | 103 +++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp index 6ccef74178..96ffe4acd6 100644 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp @@ -368,7 +368,8 @@ static void runPerf( // wantPass() returns true if this pass should run under the current filter. // Supported names: "decode", "stencil", "framing", "legacy", // "sidecar-legacy", "sidecar-stencil", "sidecar-stencil-extrap", - // "sidecar-transposed", "legacy-transposed". 
+ // "sidecar-stencil-normsqgrad", "sidecar-transposed", + // "legacy-transposed". // "all" runs everything. auto wantPass = [&](const char* name) { return passFilter == "all" || passFilter == name; @@ -825,6 +826,100 @@ static void runPerf( }); } // end wantPass("sidecar-stencil-extrap") + // ---- sidecar-stencil-normsqgrad: full Phase-2+3 pipeline ---- + // load → extrapolate → normSqGrad → store. Same Phase-2 front end as + // sidecar-stencil-extrap, but the 19-tap token sum is replaced by the + // real Phase-3 arithmetic: Godunov's fifth-order WENO upwind + // norm-square gradient. The per-voxel `|∇φ|²` goes straight into the + // output sidecar — no debug intermediate. + // + // Grid voxel size from grid->voxelSize()[0] (isotropic assumption for + // narrow-band SDFs). iso = 0 (zero-crossing is the surface). + double sidecarStencilNormSqGradUs = 0.0; + uint64_t sidecarStencilNormSqGradChecksum = 0; + if (wantPass("sidecar-stencil-normsqgrad")) { + std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); + + const float absBackground = std::abs(sidecar[0]); + const float dx = float(grid->voxelSize()[0]); + + sidecarStencilNormSqGradUs = timeForEach([&] { + nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), + [&](const nanovdb::util::Range1D& range) { + alignas(64) uint32_t leafIndex[BlockWidth]; + alignas(64) uint16_t voxelOffset[BlockWidth]; + + alignas(64) float raw_values[nanovdb::WenoStencil::size()][SIMDw]; + alignas(64) bool raw_active[nanovdb::WenoStencil::size()][SIMDw]; + + nanovdb::WenoStencil stencil(dx); + constexpr int SIZE = nanovdb::WenoStencil::size(); + using FloatV = nanovdb::util::Simd ; + using MaskV = nanovdb::util::SimdMask; + + const float* const scIn = sidecar.data(); + float* const scOut = outputSidecar.data(); + + for (size_t bID = range.begin(); bID != range.end(); ++bID) { + CPUVBM::decodeInverseMaps( + grid, firstLeafID[bID], + &jumpMap[bID * CPUVBM::JumpMapLength], + firstOffset + bID * BlockWidth, + leafIndex, voxelOffset); + + int nExtraLeaves = 0; + for (int w = 0; w < CPUVBM::JumpMapLength; ++w) + nExtraLeaves += nanovdb::util::countOn( + jumpMap[bID * CPUVBM::JumpMapLength + w]); + + SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); + const uint64_t blockBase = + firstOffset + (uint64_t)bID * BlockWidth; + + for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { + stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); + + // Fill — scalar scatter from sidecar into caller-owned raw C arrays. + for (int k = 0; k < SIZE; ++k) { + for (int i = 0; i < SIMDw; ++i) { + const uint64_t idx = stencilAcc.mIndices[k][i]; + raw_values[k][i] = scIn[idx]; + raw_active[k][i] = (idx != 0); + } + } + + // Load — per-tap SIMD load into stencil's compute view. + for (int k = 0; k < SIZE; ++k) { + stencil.values [k] = FloatV(raw_values[k], nanovdb::util::element_aligned); + stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::element_aligned); + } + + // Phase-3 arithmetic (in-place on stencil.values[], then reduce). + stencil.extrapolate(absBackground); + const FloatV result = stencil.normSqGrad(/* iso = */ 0.f); + + // Simd → scalar bridge; per-lane scalar write to output sidecar. 
+                        alignas(64) float result_lanes[SIMDw];
+                        nanovdb::util::store(result, result_lanes, nanovdb::util::element_aligned);
+                        for (int i = 0; i < SIMDw; ++i) {
+                            const int p = batchStart + i;
+                            if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
+                            scOut[blockBase + p] = result_lanes[i];
+                        }
+                    }
+                }
+            });
+        });
+
+        sidecarStencilNormSqGradChecksum =
+            std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0),
+                [](uint64_t a, float b) {
+                    uint32_t bits;
+                    std::memcpy(&bits, &b, sizeof(bits));
+                    return a ^ uint64_t(bits);
+                });
+    } // end wantPass("sidecar-stencil-normsqgrad")
+
     // ---- sidecar-transposed: tap-outer fill via direct ReadAccessor ----
     // Mirrors `legacy-transposed`'s loop structure, but instead of summing
     // uint64 indices into a per-voxel accumulator, the tap-outer loop fills
@@ -1027,6 +1122,9 @@ static void runPerf(
         std::printf("    Sidecar (stencil+extrap): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
                     sidecarStencilExtrapUs / 1e3, sidecarStencilExtrapUs * 1e3 / double(nVoxels),
                     (sidecarStencilExtrapUs - decodeUs) / 1e3, sidecarStencilExtrapChecksum);
+        std::printf("    Sidecar (+normSqGrad)   : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
+                    sidecarStencilNormSqGradUs / 1e3, sidecarStencilNormSqGradUs * 1e3 / double(nVoxels),
+                    (sidecarStencilNormSqGradUs - decodeUs) / 1e3, sidecarStencilNormSqGradChecksum);
         std::printf("    Sidecar (transposed)    : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
                     sidecarXposedUs / 1e3, sidecarXposedUs * 1e3 / double(nVoxels),
                     (sidecarXposedUs - decodeUs) / 1e3, sidecarXposedChecksum);
@@ -1063,7 +1161,8 @@ static void printUsage(const char* argv0)
              << "                     all (default), verify, decode, stencil,\n"
              << "                     framing, legacy, legacy-transposed,\n"
              << "                     sidecar-legacy, sidecar-stencil,\n"
-             << "                     sidecar-stencil-extrap, sidecar-transposed\n"
+             << "                     sidecar-stencil-extrap,\n"
+             << "                     sidecar-stencil-normsqgrad, sidecar-transposed\n"
              << "  --threads=     Limit TBB parallelism (0 = TBB default)\n"
              << "  --skip-validation  Skip the sidecar ordering sanity check\n";
 }

From 1aa985b5934344bf43663bd786bd48d3f4783c12 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Mon, 20 Apr 2026 23:41:53 -0500
Subject: [PATCH 49/60] Simd + WenoStencil: [[gnu::always_inline]] on tiny
 wrappers + Phase-3 arithmetic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Assembly analysis of the sidecar-stencil-normsqgrad pass revealed that
GCC was outlining tiny one-instruction wrappers despite their `inline`
declaration. The cost model flagged them "too heavy" because the
calling convention passes 256-bit YMM arguments by value, regardless of
how trivial the body actually is. Result:

- util::min / util::max / util::where on Simd emitted as weak symbols
  with 6-register prologue/epilogue + vzeroupper per call.
  GodunovsNormSqrd made 20 such calls per voxel batch.

- detail::WENO5 and detail::GodunovsNormSqrd also emitted as weak
  out-of-line, called 6+1 times from WenoStencil<16>::normSqGrad.

- WenoStencil<16>::normSqGrad and ::extrapolate also out-of-line,
  called from the pass lambda body.

Each outlining layer forced a vzeroupper (AVX→SSE state transition
required by the x86 ABI when the callee is not marked VZEROUPPER-aware).
Per 16-voxel batch the normSqGrad path incurred ~26 vzeroupper + 8
function calls; over 2M batches × 24 threads this dominated what should
have been a tight inlined SIMD block.

Fix: [[gnu::always_inline]] on eight sites.
Simd.h — the stdx-backend wrappers at namespace nanovdb::util: - min(Simd, Simd) - max(Simd, Simd) - where(SimdMask, Simd, Simd) (homogeneous) - where(SimdMask, Simd, Simd) (heterogeneous) WenoStencil.h: - detail::WENO5 - detail::GodunovsNormSqrd - WenoStencil::extrapolate - WenoStencil::normSqGrad With the attribute applied, all four weak symbols (WENO5, Godunov, extrapolate, normSqGrad) disappear from the object file. The pass #8 (sidecar-stencil-normsqgrad) body grows from 1693 B to 9269 B as the full Phase-2+3 pipeline collapses into one inlined block. Hot-path instruction mix (stdx backend, -O3 -march=native, AVX2+FMA3, Arrow Lake): 518 packed SIMD arithmetic ops: 218 vmulps + 128 vaddps + 72 vsubps + 48 vdivps + 24 vmaxps 160 FMAs: 71 vfmadd231ps + 36 vfmadd132ps + 29 vfnmadd132ps + 24 vfnmadd231ps 28 vbroadcastss (scalar constants to YMM) 1160 / 2044 instructions use ymm (57 %) 3 calls remaining: decodeInverseMaps, StencilAccessor::moveTo, __stack_chk_fail. 3 vzeroupper total (down from ~26 scattered across the outlined call tree). Measured on taperLER.vdb (31.8 M active voxels, 24 threads, i9-285K): Before: Sidecar (+normSqGrad) = 97.3 ms = 3.1 ns/voxel After: Sidecar (+normSqGrad) = 82.4 ms = 2.6 ns/voxel Delta: −15 % end-to-end. Isolating the Phase-3 arithmetic alone (normSqGrad pass minus sidecar-stencil-extrap pass, which stops at the extrap step): Before: 18.8 ms / 31.8 M vox = 0.59 ns/voxel After: 2.7 ms / 31.8 M vox = 0.09 ns/voxel Delta: −86 %, i.e. 7× faster on the WENO5 + Godunov arithmetic. Checksum changed 0x4371e374 → 0x438f725f. Expected: with the helpers inlined at a single source location, GCC fuses more mul+add pairs into FMAs, giving slightly different FP-rounding behaviour. Both outputs are equally correct; the difference is non-deterministic FP-arithmetic artifacts of instruction scheduling, not a semantic change. No regressions elsewhere: every other pass (StencilAccessor, Legacy, sidecar-{legacy,stencil,transposed}, legacy-transposed) stayed within ~2 % of its previous timing on this run. Background: this is the analogue for Phase-3 of the fix documented in BatchAccessor.md §8h for the StencilAccessor::moveTo path, where [[gnu::flatten]] on the moveTo() call site served the same purpose (force-inlining Simd.h helpers that GCC outlines despite `inline`). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/util/Simd.h | 14 ++++++++++---- nanovdb/nanovdb/util/WenoStencil.h | 25 ++++++++++++++----------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h index 1bc3afc2d6..af88a6028a 100644 --- a/nanovdb/nanovdb/util/Simd.h +++ b/nanovdb/nanovdb/util/Simd.h @@ -68,16 +68,22 @@ using SimdMask = stdx::fixed_size_simd_mask; template using Simd = stdx::fixed_size_simd; +// [[gnu::always_inline]] forces these thin wrappers to inline at every +// call site. Without it, GCC's cost model sometimes outlines them — +// each call then pays a function-call + vzeroupper + register-ABI +// transition that dominates the one-instruction body (vminps / vmaxps / +// vblendvps). See BatchAccessor.md §8h for the analogous fix on the +// StencilAccessor path. 
template -inline Simd min(Simd a, Simd b) { return stdx::min(a, b); } +[[gnu::always_inline]] inline Simd min(Simd a, Simd b) { return stdx::min(a, b); } template -inline Simd max(Simd a, Simd b) { return stdx::max(a, b); } +[[gnu::always_inline]] inline Simd max(Simd a, Simd b) { return stdx::max(a, b); } // TS v2 where(mask, v) is a masked assignment proxy, not a 3-arg select. // Wrap it into the select(mask, a, b) form our kernels expect. template -inline Simd where(SimdMask mask, Simd a, Simd b) { +[[gnu::always_inline]] inline Simd where(SimdMask mask, Simd a, Simd b) { auto result = b; stdx::where(mask, result) = a; return result; @@ -85,7 +91,7 @@ inline Simd where(SimdMask mask, Simd a, Simd b) { // Heterogeneous where: mask element type U ≠ value element type T. // Converts the U-mask to a T-mask via a boolean round-trip. template -inline Simd where(SimdMask mask, Simd a, Simd b) { +[[gnu::always_inline]] inline Simd where(SimdMask mask, Simd a, Simd b) { bool arr[W]; for (int i = 0; i < W; i++) arr[i] = static_cast(mask[i]); SimdMask tmask(arr, element_aligned); diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h index f1aba71545..410f5ebfad 100644 --- a/nanovdb/nanovdb/util/WenoStencil.h +++ b/nanovdb/nanovdb/util/WenoStencil.h @@ -64,9 +64,10 @@ namespace detail { // numerical epsilon; kept as a plain float for broadcast-on-demand. // --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV inline T WENO5(const T& v1, const T& v2, const T& v3, - const T& v4, const T& v5, - float scale2 = 1.f) +[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline T +WENO5(const T& v1, const T& v2, const T& v3, + const T& v4, const T& v5, + float scale2 = 1.f) { const RealT C = RealT(13.f / 12.f); const RealT eps = RealT(1.e-6f * scale2); @@ -97,10 +98,11 @@ NANOVDB_SIMD_HOSTDEV inline T WENO5(const T& v1, const T& v2, const T& v3, // degenerates this to the same semantics as the if/else. // --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV inline T GodunovsNormSqrd(MaskT isOutside, - T dP_xm, T dP_xp, - T dP_ym, T dP_yp, - T dP_zm, T dP_zp) +[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline T +GodunovsNormSqrd(MaskT isOutside, + T dP_xm, T dP_xp, + T dP_ym, T dP_yp, + T dP_zm, T dP_zp) { using util::min; using util::max; using util::where; const T zero(0.f); @@ -179,7 +181,7 @@ class WenoStencil // // Requires absBackground ≥ 0. // ------------------------------------------------------------------ - NANOVDB_SIMD_HOSTDEV void extrapolate(float absBackground); + [[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline void extrapolate(float absBackground); // ------------------------------------------------------------------ // normSqGrad — Godunov's norm-square of the fifth-order WENO upwind @@ -195,7 +197,7 @@ class WenoStencil // normSqGrad after extrapolate is the typical pipeline shape, but the // method itself does not require extrapolate to have been called. // ------------------------------------------------------------------ - NANOVDB_SIMD_HOSTDEV FloatV normSqGrad(float iso = 0.f) const; + [[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline FloatV normSqGrad(float iso = 0.f) const; private: // Hardcoded (tap, innerTap) pairs for Weno5Stencil::Taps, ordered by @@ -231,7 +233,8 @@ class WenoStencil // and W>1 (native SIMD width). 
// --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV void WenoStencil::extrapolate(float absBackground) +[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline void +WenoStencil::extrapolate(float absBackground) { const FloatV absBg(absBackground); const FloatV zero (0.f); @@ -259,7 +262,7 @@ NANOVDB_SIMD_HOSTDEV void WenoStencil::extrapolate(float absBackground) // combinator only (free on x86; identity at W=1). // --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV typename WenoStencil::FloatV +[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline typename WenoStencil::FloatV WenoStencil::normSqGrad(float iso) const { const FloatV* v = values; From 86234832ee37429a7b4a340d8d51dc02ad227778 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Tue, 21 Apr 2026 00:21:47 -0500 Subject: [PATCH 50/60] ex_weno_nanovdb_cpu: full Phase-2+3 pipeline with scalar-reference validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New standalone example that exercises the complete CPU WENO5 pipeline on a narrow-band level set loaded from a .vdb file and validates the result against a scalar reference: reference : per-voxel scalar nanovdb::math::WenoStencil> normSqGrad, driven by a TBB-parallel leaf walk. Tile values at internal nodes and in-leaf inactive values are preserved verbatim during the openvdb::FloatGrid -> NanoGrid conversion, so the stencil's ReadAccessor fetches correctly- signed values for taps outside the narrow band (the same convention OpenVDB's narrow-band level-set builder sets up, which matches the semantics of our explicit extrapolate()). fast : VBM decode -> LegacyStencilAccessor scalar gather -> per-tap SIMD load into WenoStencil -> extrapolate() -> normSqGrad() -> Simd->scalar bridge -> per-voxel scalar write to outputFast. No hybrid SIMD StencilAccessor; voxel-outer LegacyStencilAccessor chosen for code clarity. Cross-path indexing: both passes write to a sidecar-shaped output buffer keyed by ValueOnIndex slot (slot 0 = background, slots 1..N = active voxels in NanoVDB ordering). The index grid's accessor resolves `ijk -> slot` in the reference pass; VBM's firstOffset + bID*BlockWidth + lane gives the same slot in the fast pass, by VBM construction. After both passes, prints a log-decade histogram of |outputRef[i] - outputFast[i]| across all active voxels, plus max/mean deltas. On taperLER.vdb (31.8M active voxels, 24 threads, i9-285K): reference (scalar): 120.4 ms (3.80 ns/voxel) fast (VBM+SIMD) : 91.3 ms (2.87 ns/voxel) speedup: 1.3x |Delta| histogram: [0, 1e-10) : 23,431,571 (73.75%) bit-exact match [1e-10, 1e-9 ) : 2 ( 0.00%) [1e-9, 1e-8 ) : 2 ( 0.00%) [1e-8, 1e-7 ) : 1,122,483 ( 3.53%) single-ULP-class noise [1e-7, 1e-6 ) : 7,218,133 (22.72%) FMA-fusion-class noise [1e-6, inf ) : 0 ( 0.00%) max |Delta| = 9.5e-7 (at slot 12,390,855: ref=2.28233, fast=2.28234) mean |Delta| = 4.1e-8 No voxels disagree above 1e-6. Our explicit extrapolate() plus SIMD normSqGrad reproduces the scalar OpenVDB/NanoVDB tile-extrapolation ground truth to within FP-rounding tolerance — the sign-cascade rule (kPairs table in WenoStencil.h) is consistent with OpenVDB's topological tile-value convention on typical narrow-band SDFs. 
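For reference, the |Delta| histogram above is plain log-decade binning
of the per-slot absolute difference. A minimal sketch of the binning
(variable names illustrative; the actual implementation lives in
weno_nanovdb_cpu.cpp):

    // Decade bins: [0,1e-10), [1e-10,1e-9), ..., [1e-7,1e-6), [1e-6,inf)
    const double edges[] = {1e-10, 1e-9, 1e-8, 1e-7, 1e-6};
    uint64_t bins[6] = {};
    for (size_t i = 1; i < outputRef.size(); ++i) {  // slot 0 = background
        const double d = std::fabs(double(outputRef[i]) - double(outputFast[i]));
        int b = 0;
        while (b < 5 && d >= edges[b]) ++b;
        ++bins[b];
    }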
End-to-end 1.3x speedup reflects that both paths pay a roughly equal sidecar-gather cost; the 7x Phase-3-arithmetic speedup documented in BatchAccessor.md §11.6 shows up as only a fraction of total time because gather dominates (60% of fast-path wall clock) and is still scalar-scatter-shaped in both paths. A SIMD-gather front-end (the hybrid StencilAccessor path) would widen this, at the cost of the library complexity the clean demonstrator deliberately avoids. Library dependencies exercised by the fast path: - LegacyStencilAccessor.h (per-voxel scalar gather) - WenoStencil.h (Simd compute container) - Simd.h ([[gnu::always_inline]] helpers) - branchless LeafData::getValue (default body of getValue) - Weno5Stencil::Taps (tap tuple; currently in StencilAccessor.h) Not exercised — candidates for demotion or tighter doc-scoping: - StencilAccessor.h + BatchAccessor.h (hybrid SIMD gather path) - HaloStencilAccessor.md (speculative halo gather) CMakeLists.txt hook mirrors ex_narrowband_stencil_cpu: OPENVDB-gated, -march=native -fopenmp-simd, no CUDA dependency. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/examples/CMakeLists.txt | 10 + .../.#weno_nanovdb_cpu.cpp | 1 + .../ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp | 537 ++++++++++++++++++ 3 files changed, 548 insertions(+) create mode 120000 nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp create mode 100644 nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt index 37b6167d20..8fe727f696 100644 --- a/nanovdb/nanovdb/examples/CMakeLists.txt +++ b/nanovdb/nanovdb/examples/CMakeLists.txt @@ -141,6 +141,16 @@ if(TARGET ex_narrowband_stencil_cpu) ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() +# End-to-end CPU WENO5 norm-square-gradient on a narrow-band level set, +# with a scalar reference for correctness validation. +# (See BatchAccessor.md §11 for the full Phase-2+3 pipeline this demonstrates.) +nanovdb_example(NAME "ex_weno_nanovdb_cpu" OPENVDB) +if(TARGET ex_weno_nanovdb_cpu) + target_compile_options(ex_weno_nanovdb_cpu PRIVATE -march=native -fopenmp-simd) + target_include_directories(ex_weno_nanovdb_cpu PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../..) +endif() + if(CUDAToolkit_FOUND) nanovdb_example(NAME "ex_make_mgpu_nanovdb") # requires cuRAND target_link_libraries(ex_make_mgpu_nanovdb PRIVATE CUDA::curand) diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp new file mode 120000 index 0000000000..afd2029cf8 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp @@ -0,0 +1 @@ +esifakis@esifakis-fct2250.31393:1776698410 \ No newline at end of file diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp new file mode 100644 index 0000000000..bdd1022b36 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp @@ -0,0 +1,537 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file weno_nanovdb_cpu.cpp + + \brief End-to-end CPU WENO5 norm-square-gradient on a narrow-band level + set, with a scalar reference for correctness validation. 
+
+   Demonstrates the full Phase-2+3 pipeline that BatchAccessor.md §11 has
+   been leading up to:
+
+     VBM decode -> per-batch sidecar value assembly -> out-of-band
+     sign-extrapolation -> SIMD Godunov WENO5 -> per-voxel |grad phi|^2
+     output sidecar.
+
+   Two passes run over the same .vdb input:
+
+     reference : per-voxel scalar nanovdb::math::WenoStencil<NanoGrid<float>>::normSqGrad.
+                 Tile values and in-leaf inactive values preserved through the
+                 OpenVDB -> NanoVDB conversion carry correctly-signed
+                 extrapolation "for free", matching our explicit extrapolate()
+                 semantics on in-the-band-typical topology.
+
+     fast      : LegacyStencilAccessor gather -> WenoStencil load ->
+                 extrapolate() -> normSqGrad() -> per-lane scalar store.
+                 No hybrid SIMD StencilAccessor; voxel-outer Legacy path
+                 for code clarity.
+
+   Both passes write to the same-shape output buffer, keyed by ValueOnIndex
+   slot; a histogram of |outputRef - outputFast| follows.
+
+   Usage:
+     weno_nanovdb_cpu <input.vdb> [--grid=<name>]
+                                  [--threads=<N>]
+                                  [--skip-validation]
+
+   Build:
+     Configured via CMakeLists.txt in the parent examples/ directory.
+     Requires OpenVDB (for .vdb IO). No CUDA.
+*/
+
+#include <nanovdb/NanoVDB.h>
+#include <nanovdb/tools/CreateNanoGrid.h>
+#include <nanovdb/tools/cuda/VoxelBlockManager.cuh>
+#include <nanovdb/util/ForEach.h>
+#include <nanovdb/util/LegacyStencilAccessor.h>
+#include <nanovdb/util/Simd.h>
+#include <nanovdb/util/Timer.h>
+#include <nanovdb/util/WenoStencil.h>
+#include <nanovdb/math/Stencils.h> // scalar reference WenoStencil
+
+#include <openvdb/openvdb.h>
+#include <openvdb/io/File.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <tbb/global_control.h>
+#include <tbb/info.h>
+
+// ============================================================
+// Constants and type aliases
+// ============================================================
+
+static constexpr int Log2BlockWidth = 7;
+static constexpr int BlockWidth     = 1 << Log2BlockWidth; // 128
+static constexpr int SIMDw          = 16;                  // float lane width
+
+using BuildT     = nanovdb::ValueOnIndex;
+using IndexGridT = nanovdb::NanoGrid<BuildT>;
+using LeafT      = nanovdb::NanoLeaf<BuildT>;
+using FloatGridT = nanovdb::NanoGrid<float>;
+using CPUVBM     = nanovdb::tools::VoxelBlockManager<Log2BlockWidth>;
+
+using LegacyAccT = nanovdb::LegacyStencilAccessor<BuildT, nanovdb::Weno5Stencil>;
+
+// ============================================================
+// VDB loading and NanoVDB conversion
+// ============================================================
+
+static openvdb::FloatGrid::Ptr
+loadFloatGridFromVdb(const std::string& path, const std::string& gridName)
+{
+    openvdb::io::File file(path);
+    file.open(false); // delayed loading off
+
+    openvdb::GridBase::Ptr base;
+    if (!gridName.empty()) {
+        if (!file.hasGrid(gridName))
+            throw std::runtime_error("no grid named \"" + gridName + "\" in " + path);
+        base = file.readGrid(gridName);
+    } else {
+        openvdb::GridPtrVecPtr grids = file.getGrids();
+        for (auto& g : *grids) {
+            if (g && g->isType<openvdb::FloatGrid>()) { base = g; break; }
+        }
+        if (!base) throw std::runtime_error("no openvdb::FloatGrid found in " + path);
+    }
+    file.close();
+
+    auto floatGrid = openvdb::gridPtrCast<openvdb::FloatGrid>(base);
+    if (!floatGrid) throw std::runtime_error("grid is not an openvdb::FloatGrid");
+    return floatGrid;
+}
+
+/// NanoVDB conversion products shared across the two passes.
+/// - floatHandle : NanoGrid<float> — tile values + in-leaf inactive values
+///                 preserved verbatim, used by the scalar reference stencil.
+/// - indexHandle : NanoGrid<ValueOnIndex> — the topology-only index grid.
+/// - sidecar     : float sidecar (slot 0 = background, slots 1..N = active
+///                 voxel values in NanoVDB indexing order).
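+/// (Lifetime note, for illustration: the two handles own their grids'
+/// memory, so the raw grid pointers obtained from them in main() below
+/// stay valid only while this struct is alive.)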
+struct ConvertedGrids {
+    nanovdb::GridHandle<nanovdb::HostBuffer> floatHandle;
+    nanovdb::GridHandle<nanovdb::HostBuffer> indexHandle;
+    std::vector<float>                       sidecar;
+};
+
+static ConvertedGrids
+convertFloatGrid(openvdb::FloatGrid& floatGrid)
+{
+    ConvertedGrids out;
+
+    // Direct OpenVDB -> NanoVDB float conversion. No flags needed:
+    // - tile values at internal nodes are stored directly (mTable);
+    // - in-leaf inactive voxels share storage with active ones (dense 8^3).
+    // Both are preserved verbatim from the source grid.
+    out.floatHandle = nanovdb::tools::createNanoGrid(floatGrid);
+
+    // Sidecar pipeline: build ValueOnIndex + float sidecar via the CreateNanoGrid builder.
+    nanovdb::tools::CreateNanoGrid<openvdb::FloatGrid> builder(floatGrid);
+    out.indexHandle = builder.template getHandle<
+        nanovdb::ValueOnIndex, nanovdb::HostBuffer>(
+            /*channels =*/ 0u,
+            /*incStats =*/ false,
+            /*incTiles =*/ false);
+    out.sidecar.resize(builder.valueCount());
+    builder.template copyValues<float>(out.sidecar.data());
+
+    // NanoVDB convention: slot 0 is the "not-found / background" sentinel.
+    // copyValues doesn't touch it. Set it to the grid's background so the
+    // fast path can gather unconditionally (no per-lane branch on idx==0 during fill).
+    if (!out.sidecar.empty()) out.sidecar[0] = floatGrid.background();
+
+    return out;
+}
+
+// ============================================================
+// Reference pass — scalar WenoStencil per active voxel
+// ============================================================
+//
+// Uses nanovdb::math::WenoStencil<NanoGrid<float>>. Its moveTo(ijk)
+// populates 19 taps via the float grid's accessor; taps outside the
+// narrow band hit either in-leaf inactive slots (stored +-background by
+// the OpenVDB narrow-band builder) or tile values at internal nodes
+// (same convention). Both cases yield correctly-signed extrapolated
+// values, matching the semantics of our explicit extrapolate().
+//
+// Cross-path indexing: both outputRef and outputFast are indexed by
+// the ValueOnIndex slot of a voxel. For each active ijk we compute
+// normSqGrad on the float grid, then resolve its output slot via the
+// index grid's accessor.
+
+static double
+runReference(const FloatGridT& floatGrid,
+             const IndexGridT& indexGrid,
+             std::vector<float>& outputRef)
+{
+    std::fill(outputRef.begin(), outputRef.end(), 0.f);
+
+    const uint32_t nLeaves = indexGrid.tree().nodeCount(0);
+
+    std::ostringstream sink;
+    nanovdb::util::Timer timer;
+    auto timeIt = [&](auto&& body) -> double {
+        timer.start("", sink);
+        body();
+        return static_cast<double>(timer.elapsed());
+    };
+
+    return timeIt([&] {
+        nanovdb::util::forEach(uint32_t(0), nLeaves, uint32_t(1),
+            [&](const nanovdb::util::Range1D& range) {
+                // One scalar stencil + one index accessor per TBB task.
+                nanovdb::math::WenoStencil<FloatGridT> stencil(floatGrid);
+                auto indexAcc = indexGrid.getAccessor();
+
+                const auto* firstFloatLeaf =
+                    floatGrid.tree().template getFirstNode<0>();
+                const auto* firstIndexLeaf =
+                    indexGrid.tree().template getFirstNode<0>();
+
+                for (uint32_t lid = range.begin(); lid != range.end(); ++lid) {
+                    // The two grids share topology, so leaf LID in the
+                    // index grid aligns with leaf LID in the float grid
+                    // (same order of insertion). Iterate the index grid's
+                    // active voxels — those are the slots we need to fill.
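+                    // (A debug-build check of the shared-topology
+                    //  assumption, for illustration:
+                    //  assert(firstIndexLeaf[lid].origin() ==
+                    //         firstFloatLeaf[lid].origin());)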
+                    const auto& indexLeaf = firstIndexLeaf[lid];
+                    (void)firstFloatLeaf; // stencil.moveTo uses its own acc
+
+                    for (auto it = indexLeaf.beginValueOn(); it; ++it) {
+                        const nanovdb::Coord ijk = it.getCoord();
+                        stencil.moveTo(ijk);
+                        const float r = stencil.normSqGrad(/*iso=*/0.f);
+                        const uint64_t idx = indexAcc.getValue(ijk);
+                        outputRef[idx] = r;
+                    }
+                }
+            });
+    });
+}
+
+// ============================================================
+// Fast pass — LegacyStencilAccessor gather + WenoStencil compute
+// ============================================================
+//
+// Structure:
+//   for each VBM block:
+//     decodeInverseMaps -> leafIndex[128], voxelOffset[128]
+//     for each batch of SIMDw voxels:
+//       fill:  scalar scatter from sidecar into raw_values[SIZE][SIMDw]
+//              via LegacyStencilAccessor::moveTo per voxel
+//       load:  per-tap SIMD load into stencil.values[] / isActive[]
+//       extrapolate (sign-fix OOB lanes in-place, Simd)
+//       normSqGrad -> FloatV
+//       store: per-lane scalar write to outputFast[blockBase + p]
+
+static double
+runFast(const IndexGridT& indexGrid,
+        const nanovdb::tools::VoxelBlockManagerHandle<nanovdb::HostBuffer>& vbmHandle,
+        const std::vector<float>& sidecar,
+        std::vector<float>& outputFast)
+{
+    std::fill(outputFast.begin(), outputFast.end(), 0.f);
+
+    const LeafT*    firstLeaf   = indexGrid.tree().template getFirstNode<0>();
+    const uint32_t  nBlocks     = (uint32_t)vbmHandle.blockCount();
+    const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID();
+    const uint64_t* jumpMap     = vbmHandle.hostJumpMap();
+    const uint64_t  firstOffset = vbmHandle.firstOffset();
+
+    const float absBackground = std::abs(sidecar[0]);
+    const float dx = float(indexGrid.voxelSize()[0]);
+
+    std::ostringstream sink;
+    nanovdb::util::Timer timer;
+    auto timeIt = [&](auto&& body) -> double {
+        timer.start("", sink);
+        body();
+        return static_cast<double>(timer.elapsed());
+    };
+
+    return timeIt([&] {
+        nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
+            [&](const nanovdb::util::Range1D& range) {
+                alignas(64) uint32_t leafIndex[BlockWidth];
+                alignas(64) uint16_t voxelOffset[BlockWidth];
+
+                // Caller-owned fill-side scratch — scalar scatter from the
+                // sidecar lands here, then a per-tap SIMD load moves the
+                // data into the stencil's Simd compute view.
+                alignas(64) float raw_values[nanovdb::WenoStencil<SIMDw>::size()][SIMDw];
+                alignas(64) bool  raw_active[nanovdb::WenoStencil<SIMDw>::size()][SIMDw];
+
+                nanovdb::WenoStencil<SIMDw> stencil(dx);
+                constexpr int SIZE = nanovdb::WenoStencil<SIMDw>::size();
+                using FloatV = nanovdb::util::Simd<float, SIMDw>;
+                using MaskV  = nanovdb::util::SimdMask<float, SIMDw>;
+
+                // One LegacyStencilAccessor per TBB task (one ReadAccessor).
+                LegacyAccT legacyAcc(indexGrid);
+
+                const float* const scIn  = sidecar.data();
+                float*       const scOut = outputFast.data();
+
+                for (size_t bID = range.begin(); bID != range.end(); ++bID) {
+                    CPUVBM::decodeInverseMaps(
+                        &indexGrid, firstLeafID[bID],
+                        &jumpMap[bID * CPUVBM::JumpMapLength],
+                        firstOffset + bID * BlockWidth,
+                        leafIndex, voxelOffset);
+
+                    const uint64_t blockBase =
+                        firstOffset + (uint64_t)bID * BlockWidth;
+
+                    for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
+                        // -------- Fill: LegacyStencilAccessor per voxel --------
+                        // Voxel-outer, tap-inner inside the moveTo call
+                        // (fillTaps unrolls the 19 tap lookups against the
+                        // shared ReadAccessor). Zero-fill inactive lanes.
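+                        // (Zero-filling dead lanes, rather than skipping the
+                        //  batch, keeps the SIMD load / extrapolate below
+                        //  unconditional; dead-lane results are discarded at
+                        //  the store step.)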
+                        for (int i = 0; i < SIMDw; ++i) {
+                            const int p = batchStart + i;
+
+                            if (leafIndex[p] == CPUVBM::UnusedLeafIndex) {
+                                for (int k = 0; k < SIZE; ++k) {
+                                    raw_values[k][i] = 0.f;
+                                    raw_active[k][i] = false;
+                                }
+                                continue;
+                            }
+
+                            const uint16_t vo = voxelOffset[p];
+                            const uint32_t li = leafIndex[p];
+                            const nanovdb::Coord cOrigin = firstLeaf[li].origin();
+                            const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7;
+                            const nanovdb::Coord center =
+                                cOrigin + nanovdb::Coord(lx, ly, lz);
+
+                            legacyAcc.moveTo(center);
+                            for (int k = 0; k < SIZE; ++k) {
+                                const uint64_t idx = legacyAcc[k];
+                                raw_values[k][i] = scIn[idx];
+                                raw_active[k][i] = (idx != 0);
+                            }
+                        }
+
+                        // -------- Load: per-tap SIMD load into stencil view --------
+                        for (int k = 0; k < SIZE; ++k) {
+                            stencil.values  [k] = FloatV(raw_values[k], nanovdb::util::element_aligned);
+                            stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::element_aligned);
+                        }
+
+                        // -------- Phase-3 arithmetic (in-place on Simd values) --------
+                        stencil.extrapolate(absBackground);
+                        const FloatV result = stencil.normSqGrad(/*iso=*/0.f);
+
+                        // -------- Simd -> scalar bridge + per-lane store --------
+                        alignas(64) float result_lanes[SIMDw];
+                        nanovdb::util::store(result, result_lanes, nanovdb::util::element_aligned);
+                        for (int i = 0; i < SIMDw; ++i) {
+                            const int p = batchStart + i;
+                            if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
+                            scOut[blockBase + p] = result_lanes[i];
+                        }
+                    }
+                }
+            });
+    });
+}
+
+// ============================================================
+// Histogram comparison
+// ============================================================
+//
+// Per-index |outputRef[i] - outputFast[i]| over all active voxels
+// (index 1..N; slot 0 is the background/no-op). Log-decade bins
+// from 0 to 1e+1 plus a tail bucket for anything >= 10.
+//
+// Expected shape: the [0, 1e-10) bin (bit-exact) together with the
+// [1e-8, 1e-7) and [1e-7, 1e-6) decades holds the overwhelming
+// majority — FP-rounding / FMA-fusion differences. Anything to the
+// right of [1e-5, 1e-4) warrants investigation.
+
+static void
+reportHistogram(const std::vector<float>& outputRef,
+                const std::vector<float>& outputFast,
+                uint64_t nActive)
+{
+    // Bucket edges: 0, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e+0, 1e+1
+    static constexpr int nBuckets = 12;
+    static constexpr double edges[nBuckets + 1] = {
+        0.0,  1e-10, 1e-9, 1e-8, 1e-7, 1e-6,
+        1e-5, 1e-4,  1e-3, 1e-2, 1e-1, 1.0,  10.0
+    };
+    static const char* labels[nBuckets] = {
+        "[0,     1e-10)",
+        "[1e-10, 1e-9 )",
+        "[1e-9,  1e-8 )",
+        "[1e-8,  1e-7 )",
+        "[1e-7,  1e-6 )",
+        "[1e-6,  1e-5 )",
+        "[1e-5,  1e-4 )",
+        "[1e-4,  1e-3 )",
+        "[1e-3,  1e-2 )",
+        "[1e-2,  1e-1 )",
+        "[1e-1,  1.0  )",
+        "[1.0,   10.0 )"
+    };
+
+    std::array<uint64_t, nBuckets + 1> counts{}; // last bucket = [10, inf)
+    double   sumDelta = 0.0;
+    float    maxDelta = 0.f;
+    uint64_t worstIdx = 0;
+    uint64_t counted  = 0;
+
+    for (uint64_t i = 1; i <= nActive; ++i) {
+        const float d = std::abs(outputRef[i] - outputFast[i]);
+        ++counted;
+        sumDelta += double(d);
+        if (d > maxDelta) { maxDelta = d; worstIdx = i; }
+
+        int b = nBuckets; // overflow bucket
+        for (int k = 0; k < nBuckets; ++k) {
+            if (double(d) < edges[k + 1]) { b = k; break; }
+        }
+        ++counts[b];
+    }
+
+    std::printf("\n|Delta| histogram across %lu active voxels"
+                " (outputRef vs outputFast):\n", counted);
+    for (int k = 0; k < nBuckets; ++k) {
+        const double pct = counted ? 100.0 * double(counts[k]) / double(counted) : 0.0;
+        std::printf("  %-18s : %12lu (%6.2f%%)\n",
+                    labels[k], counts[k], pct);
+    }
+    const double pctTail = counted ?
+        100.0 * double(counts[nBuckets]) / double(counted) : 0.0;
+    std::printf("  %-18s : %12lu (%6.2f%%)\n",
+                "[10.0,  inf  )", counts[nBuckets], pctTail);
+
+    std::printf("\n  max  |Delta| = %.6g (at slot %lu:"
+                " ref=%.6g, fast=%.6g)\n",
+                double(maxDelta), worstIdx,
+                double(outputRef[worstIdx]),
+                double(outputFast[worstIdx]));
+    std::printf("  mean |Delta| = %.6g\n",
+                counted ? sumDelta / double(counted) : 0.0);
+}
+
+// ============================================================
+// Entry point
+// ============================================================
+
+static void printUsage(const char* argv0)
+{
+    std::cerr
+        << "Usage: " << argv0 << " <input.vdb>"
+        << " [--grid=<name>] [--threads=<N>]\n"
+        << "\n"
+        << "  <input.vdb>     Input OpenVDB file (single FloatGrid narrow-band)\n"
+        << "  --grid=<name>   Select grid by name (default: first FloatGrid)\n"
+        << "  --threads=<N>   Limit TBB parallelism (0 = TBB default)\n";
+}
+
+int main(int argc, char** argv)
+{
+    try {
+        if (argc < 2 || std::string(argv[1]) == "--help"
+                     || std::string(argv[1]) == "-h") {
+            printUsage(argv[0]);
+            return argc < 2 ? 1 : 0;
+        }
+
+        std::string vdbPath  = argv[1];
+        std::string gridName = "";
+        int nThreads = 0;
+
+        for (int i = 2; i < argc; ++i) {
+            std::string a = argv[i];
+            if      (a.rfind("--grid=", 0) == 0)    gridName = a.substr(7);
+            else if (a.rfind("--threads=", 0) == 0) nThreads = std::stoi(a.substr(10));
+            else { printUsage(argv[0]); return 1; }
+        }
+
+        std::cout << "vdb path  = " << vdbPath << "\n"
+                  << "grid name = " << (gridName.empty() ? "(first FloatGrid)" : gridName) << "\n"
+                  << "threads   = " << (nThreads > 0 ? std::to_string(nThreads)
+                                                     : std::string("(TBB default)")) << "\n";
+
+        // ---- Load the .vdb and convert to both NanoVDB representations ----
+        openvdb::initialize();
+        auto floatGrid = loadFloatGridFromVdb(vdbPath, gridName);
+
+        const auto bbox  = floatGrid->evalActiveVoxelBoundingBox();
+        const auto vsize = floatGrid->voxelSize();
+        std::cout << "FloatGrid:\n"
+                  << "  active voxels = " << floatGrid->activeVoxelCount() << "\n"
+                  << "  bbox          = [" << bbox.min() << " .. " << bbox.max() << "]\n"
+                  << "  voxel size    = " << vsize << "\n"
+                  << "  background    = " << floatGrid->background() << "\n";
+
+        auto payload = convertFloatGrid(*floatGrid);
+        auto* nanoFloatGrid = payload.floatHandle.grid<float>();
+        auto* indexGrid     = payload.indexHandle.grid<BuildT>();
+        if (!nanoFloatGrid || !indexGrid)
+            throw std::runtime_error("NanoVDB conversion failed");
+
+        const auto& tree = indexGrid->tree();
+        std::cout << "NanoVDB:\n"
+                  << "  leaves        = " << tree.nodeCount(0) << "\n"
+                  << "  active voxels = " << indexGrid->activeVoxelCount() << "\n"
+                  << "  sidecar size  = " << payload.sidecar.size() << "\n";
+
+        // ---- VBM for the fast path ----
+        auto vbmHandle = nanovdb::tools::buildVoxelBlockManager(indexGrid);
+        std::cout << "VBM:\n"
+                  << "  blocks        = " << vbmHandle.blockCount()
+                  << " (BlockWidth=" << BlockWidth << ")\n\n";
+
+        // ---- TBB thread cap for timings ----
+        std::unique_ptr<tbb::global_control> tbbLimit;
+        if (nThreads > 0) {
+            tbbLimit = std::make_unique<tbb::global_control>(
+                tbb::global_control::max_allowed_parallelism, (size_t)nThreads);
+        }
+
+        // ---- Output buffers ----
+        std::vector<float> outputRef (payload.sidecar.size(), 0.f);
+        std::vector<float> outputFast(payload.sidecar.size(), 0.f);
+
+        // ---- Run both passes (warm + timed) ----
+        // Warm pass (ignored) for both, then one timed pass each.
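+        // (Warm passes fault in the sidecar/output pages and prime the
+        //  caches, so the timed passes below measure steady-state
+        //  throughput rather than first-touch overhead.)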
+        (void)runReference(*nanoFloatGrid, *indexGrid, outputRef);
+        (void)runFast(*indexGrid, vbmHandle, payload.sidecar, outputFast);
+
+        const double refUs  = runReference(*nanoFloatGrid, *indexGrid, outputRef);
+        const double fastUs = runFast(*indexGrid, vbmHandle, payload.sidecar, outputFast);
+
+        const uint64_t nActive = indexGrid->activeVoxelCount();
+        const double refNs  = refUs  * 1e3 / double(nActive);
+        const double fastNs = fastUs * 1e3 / double(nActive);
+
+        std::printf("\nEnd-to-end WENO5 |grad phi|^2 (%lu active voxels):\n", nActive);
+        std::printf("  reference (scalar): %9.1f ms (%7.1f ns/voxel)\n",
+                    refUs / 1e3, refNs);
+        std::printf("  fast (VBM+SIMD)   : %9.1f ms (%7.2f ns/voxel)   speedup: %.1fx\n",
+                    fastUs / 1e3, fastNs, refUs / std::max(fastUs, 1.0));
+
+        // ---- Histogram of discrepancies ----
+        reportHistogram(outputRef, outputFast, nActive);
+
+    } catch (const std::exception& e) {
+        std::cerr << "Exception: " << e.what() << "\n";
+        return 1;
+    }
+    return 0;
+}

From 3ffe1e791248060a4545264b6e8dcb0ee8a7c3b6 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Tue, 21 Apr 2026 00:22:15 -0500
Subject: [PATCH 51/60] ex_weno_nanovdb_cpu: remove stray emacs lock symlink

.#weno_nanovdb_cpu.cpp is an emacs per-user lock artifact that got
swept up by `git add` in the previous commit. It's a dangling symlink
to esifakis@host:pid, not a meaningful repo artifact.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Efty Sifakis
---
 .../nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp

diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp
deleted file mode 120000
index afd2029cf8..0000000000
--- a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/.#weno_nanovdb_cpu.cpp
+++ /dev/null
@@ -1 +0,0 @@
-esifakis@esifakis-fct2250.31393:1776698410
\ No newline at end of file

From 2d8d94fa719d384efbf052cb2331b497f094fb5c Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Wed, 22 Apr 2026 22:29:59 -0500
Subject: [PATCH 52/60] WenoStencil: self-contained Taps tuple, no more
 Weno5Stencil policy dep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consolidates the 19-tap WENO5 tuple into WenoStencil itself, drops the
external dependency on StencilAccessor.h, and removes references to
the Weno5Stencil policy struct / Hull (neither needed now that the
hybrid SIMD StencilAccessor is on the way out per the cleanup plan).

Before:  WenoStencil.h -> #include <nanovdb/util/StencilAccessor.h>
         using Taps = Weno5Stencil::Taps;
         tapIndex() -> detail::findIndex();

After:   WenoStencil.h -> self-contained; no accessor include
         nested TapPoint<DI,DJ,DK> as the tap-offset type
         nested using Taps = std::tuple<TapPoint<0,0,0>, ...>;
         private static findTap() helper (compile-time inverse map,
         same pattern as the ex-detail::findIndex)

Tap ordering unchanged — canonical WenoPt::idx convention (center,
x-axis -3..+3, y-axis -3..+3, z-axis -3..+3).

Hull intentionally absent (was only consumed by the hybrid
StencilAccessor prefetch path, which is no longer exercised).

StencilAccessor.h is left alone this pass: it still defines its own
StencilPoint + detail::findIndex + Weno5Stencil policy (with both Taps
and Hull) for the benchmark passes that still instantiate
StencilAccessor<..., Weno5Stencil>. Those will fall away when the
hybrid path is removed in a subsequent cleanup step; the decoupling
here is preparation for that.
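Call-site shape after this change, for illustration (W=16 as in the
CPU example):

    using S = nanovdb::WenoStencil<16>;
    constexpr int xm3 = S::tapIndex<-3, 0, 0>();  // slot 1 in Taps
    // S::tapIndex<2, 2, 0>() would trip the static_assert: the offset
    // is not axis-aligned, so it is not in the stencil.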
LegacyStencilAccessor.h also unchanged — it still includes
StencilAccessor.h for StencilPoint/findIndex, and its callers use
Weno5Stencil as the policy type. WenoStencil::Taps could replace that
role later, but since W=1 vs W=16 stencils would pin the policy to a
specific lane width, the more natural move is to introduce a
standalone tap-tuple header when StencilAccessor.h retires.

Verification on taperLER.vdb (31.8M active voxels, 24 threads):
- ex_weno_nanovdb_cpu, ex_narrowband_stencil_cpu, ex_stencil_gather_cpu
  all rebuild and link clean.
- ex_weno_nanovdb_cpu output identical (slot-by-slot) to pre-refactor:
  73.75% bit-exact, 3.53% in [1e-8,1e-7), 22.72% in [1e-7,1e-6),
  0 voxels above 1e-6; max |Delta|=9.5e-7 at the same slot as before.
- Timings within noise (94.8 ms fast / 121.1 ms reference).

WenoStencil.md:
- §3.3 text refreshed: kPairs indices now cross-reference
  WenoStencil::Taps rather than Weno5Stencil::Taps.
- §5.3 tapIndex forwarding reference: detail::findIndex -> private
  static findTap.
- §7.2 (the old "Consolidate the Weno5Stencil policy" to-do) removed;
  the consolidation it proposed is this commit. §7.3 renumbered to §7.2.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/util/WenoStencil.h  | 56 ++++++++++++++++++++++-------
 nanovdb/nanovdb/util/WenoStencil.md | 37 ++++++++-----------
 2 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h
index 410f5ebfad..43a8267b1d 100644
--- a/nanovdb/nanovdb/util/WenoStencil.h
+++ b/nanovdb/nanovdb/util/WenoStencil.h
@@ -40,7 +40,6 @@
 #pragma once
 
 #include <nanovdb/util/Simd.h>
-#include <nanovdb/util/StencilAccessor.h> // StencilPoint, Weno5Stencil, detail::findIndex
 #include <nanovdb/util/Util.h>            // Pow2
 #include <tuple>
 
@@ -132,12 +131,35 @@
 template <int W = 1>
 class WenoStencil
 {
 public:
-    static constexpr int SIZE = 19;
-
-    using Taps = Weno5Stencil::Taps;
     using FloatV = util::Simd<float, W>;
     using MaskV  = util::SimdMask<float, W>;
 
+    // --- Tap-offset types (compile-time only) -----------------------------
+    // TapPoint<DI,DJ,DK> carries the tap offset as a type. Taps is the
+    // 19-tap tuple in the canonical WenoPt::idx ordering from
+    // nanovdb/math/Stencils.h:
+    //   idx 0      : center    < 0, 0, 0>
+    //   idx 1.. 6  : x-axis    <-3,0,0> <-2,0,0> <-1,0,0> <+1,0,0> <+2,0,0> <+3,0,0>
+    //   idx 7..12  : y-axis    <0,-3,0> <0,-2,0> <0,-1,0> <0,+1,0> <0,+2,0> <0,+3,0>
+    //   idx 13..18 : z-axis    <0,0,-3> <0,0,-2> <0,0,-1> <0,0,+1> <0,0,+2> <0,0,+3>
+    template <int DI, int DJ, int DK>
+    struct TapPoint {
+        static constexpr int di = DI, dj = DJ, dk = DK;
+    };
+
+    using Taps = std::tuple<
+        TapPoint< 0, 0, 0>,
+        TapPoint<-3, 0, 0>, TapPoint<-2, 0, 0>, TapPoint<-1, 0, 0>,
+        TapPoint<+1, 0, 0>, TapPoint<+2, 0, 0>, TapPoint<+3, 0, 0>,
+        TapPoint< 0,-3, 0>, TapPoint< 0,-2, 0>, TapPoint< 0,-1, 0>,
+        TapPoint< 0,+1, 0>, TapPoint< 0,+2, 0>, TapPoint< 0,+3, 0>,
+        TapPoint< 0, 0,-3>, TapPoint< 0, 0,-2>, TapPoint< 0, 0,-1>,
+        TapPoint< 0, 0,+1>, TapPoint< 0, 0,+2>, TapPoint< 0, 0,+3>
+    >;
+
+    static constexpr int SIZE = int(std::tuple_size_v<Taps>);
+    static constexpr int size() { return SIZE; }
+
     // Compute-side storage — first-class Simd values. At W=1 these collapse
     // to plain scalar float / bool under the array backend.
     FloatV values  [SIZE];
@@ -154,17 +176,13 @@ class WenoStencil
     NANOVDB_SIMD_HOSTDEV explicit WenoStencil(float dx)
         : mDx2(dx * dx), mInvDx2(1.f / (dx * dx)) {}
 
-    static constexpr int size() { return SIZE; }
-
     // Compile-time named-tap access: returns the index of tap (DI,DJ,DK) in
     // the Taps tuple.
Ordering matches WenoPt::idx in
-    // nanovdb/math/Stencils.h, so this is interoperable with canonical WENO
-    // index conventions.
+    // nanovdb/math/Stencils.h.
     template <int DI, int DJ, int DK>
     static constexpr int tapIndex()
     {
-        constexpr int I = detail::findIndex<DI, DJ, DK, Taps>(
-            std::make_index_sequence<std::tuple_size_v<Taps>>{});
+        constexpr int I = findTap<DI, DJ, DK>(std::make_index_sequence<SIZE>{});
         static_assert(I >= 0, "WenoStencil::tapIndex: tap not in stencil");
         return I;
     }
@@ -200,8 +218,22 @@ class WenoStencil
     [[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline FloatV normSqGrad(float iso = 0.f) const;
 
 private:
-    // Hardcoded (tap, innerTap) pairs for Weno5Stencil::Taps, ordered by
-    // ascending |Δ|. Indices match the tuple definition in StencilAccessor.h.
+    // Compile-time inverse map: (DI,DJ,DK) → slot index in Taps. Returns -1
+    // if no matching tap exists; tapIndex() turns that into a static_assert.
+    template <int DI, int DJ, int DK, std::size_t... Is>
+    static constexpr int findTap(std::index_sequence<Is...>)
+    {
+        int result = -1;
+        ((std::tuple_element_t<Is, Taps>::di == DI &&
+          std::tuple_element_t<Is, Taps>::dj == DJ &&
+          std::tuple_element_t<Is, Taps>::dk == DK &&
+          result < 0 ? (result = int(Is)) : 0), ...);
+        return result;
+    }
+
+    // Hardcoded (tap, innerTap) pairs for the 19-tap Taps tuple, ordered by
+    // ascending |Δ| so the inner tap is always already resolved when the
+    // outer tap is processed. Indices match the Taps tuple above.
     //
     //   idx 0      : center  ( 0, 0, 0)
     //   idx 1.. 6  : x-axis  (-3..+3 in the order -3,-2,-1,+1,+2,+3)
diff --git a/nanovdb/nanovdb/util/WenoStencil.md b/nanovdb/nanovdb/util/WenoStencil.md
index 98585cd478..590b933d2c 100644
--- a/nanovdb/nanovdb/util/WenoStencil.md
+++ b/nanovdb/nanovdb/util/WenoStencil.md
@@ -164,8 +164,8 @@ outer tap is processed. Sign propagation through a |Δ|=1 → |Δ|=2 →
 ### 3.3 The `kPairs[]` table
 
-The inner-tap relationship is `Weno5Stencil`-specific and hardcoded as
-a static table inside the class:
+The inner-tap relationship is WENO5-specific and hardcoded as a static
+table inside the class:
 
 ```cpp
 static constexpr int kPairs[18][2] = {
@@ -178,16 +178,16 @@ static constexpr int kPairs[18][2] = {
 };
 ```
 
-Indices match the tuple ordering in `Weno5Stencil::Taps`
-(`StencilAccessor.h`) and `WenoPt::idx` in
-`nanovdb/math/Stencils.h`. Center tap (idx 0) is not processed —
+Indices match the `WenoStencil::Taps` tuple defined in
+`WenoStencil.h` (same ordering as `WenoPt::idx` in
+`nanovdb/math/Stencils.h`). Center tap (idx 0) is not processed —
 assumed always in-band.
 
 **Why hardcoded, not template-derived:** a generic scheme would walk
-`Weno5Stencil::Taps` at compile time and derive inner-tap indices from
-|Δ| and axis alignment. For a single stencil the table is 18 entries,
-reads directly, and makes the cascade ordering self-documenting.
-Worth revisiting if we add Weno7 or other axis-aligned WENO variants.
+`Taps` at compile time and derive inner-tap indices from |Δ| and axis
+alignment. For a single stencil the table is 18 entries, reads
+directly, and makes the cascade ordering self-documenting. Worth
+revisiting if we add Weno7 or other axis-aligned WENO variants.
 
 ### 3.4 `extrapolate()` implementation
 
@@ -378,9 +378,9 @@
 constexpr int xm3 = WenoStencil<W>::tapIndex<-3, 0, 0>();
 FloatV xm3Value = stencil.values[xm3];
 ```
 
-`tapIndex()` forwards to `detail::findIndex` (shared with
-`StencilAccessor`), static-asserting at compile time that the
-requested tap exists in the Weno5Stencil::Taps tuple.
+`tapIndex()` forwards to a private static `findTap` helper
+inside `WenoStencil`, static-asserting at compile time that the
+requested tap exists in the `Taps` tuple.
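+
+For an offset that is not in the stencil, the lookup fails at compile
+time (illustrative):
+
+```cpp
+// static assertion failed: WenoStencil::tapIndex: tap not in stencil
+// constexpr int bad = WenoStencil<W>::tapIndex<2, 2, 0>();
+```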
--- @@ -428,20 +428,11 @@ taperLER.vdb; compare against `sidecar-stencil-extrap` (which writes the tap-sum instead of normSqGrad) to isolate the Phase-3 arithmetic cost. -### 7.2 Consolidate the Weno5Stencil policy - -Currently `Weno5Stencil` (the tap-tuple policy struct) lives in -`StencilAccessor.h` and is shared with `WenoStencil` via -`using Taps = Weno5Stencil::Taps`. The policy is arguably a -Weno-specific definition and could move into `WenoStencil.h`; -`StencilAccessor.h` would then `#include <.../WenoStencil.h>` for the -policy. Left as-is to minimise churn across files. - -### 7.3 Alternative stencils +### 7.2 Alternative stencils If/when Weno7 or a non-axis-aligned stencil is needed, the class would specialise on a stencil-policy template parameter rather than -hardcode `Weno5Stencil`: +hardcoding the 19-tap WENO5 shape: ```cpp template From 79004aa3f3579a484bc0065c82bcaf1736391db0 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Wed, 22 Apr 2026 23:19:09 -0500 Subject: [PATCH 53/60] =?UTF-8?q?nanovdb:=20cleanup=20=E2=80=94=20remove?= =?UTF-8?q?=20hybrid=20StencilAccessor/BatchAccessor=20stack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete util/StencilAccessor.h, util/BatchAccessor.h - util/LegacyStencilAccessor.h: self-contained (inlines own findTap) - util/Util.h: add NANOVDB_FORCEINLINE macro; remove unused countTrailingZeros - util/Simd.h: retire NANOVDB_SIMD_HOSTDEV, use canonical __hostdev__; [[gnu::always_inline]] -> NANOVDB_FORCEINLINE - util/WenoStencil.h: same attribute/macro cleanup - examples/ex_weno_nanovdb_cpu: Weno5Stencil -> WenoStencil<> - examples/CMakeLists.txt: disable ex_narrowband_stencil_cpu and ex_stencil_gather_cpu (sources remain on disk for later removal). Verified bit-exact output on taperLER.vdb (max |Delta|=9.5e-7 at slot 12390855, unchanged). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/examples/CMakeLists.txt | 20 - .../ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp | 2 +- nanovdb/nanovdb/util/BatchAccessor.h | 503 ------------------ nanovdb/nanovdb/util/LegacyStencilAccessor.h | 43 +- nanovdb/nanovdb/util/Simd.h | 177 +++--- nanovdb/nanovdb/util/StencilAccessor.h | 387 -------------- nanovdb/nanovdb/util/Util.h | 48 +- nanovdb/nanovdb/util/WenoStencil.h | 16 +- 8 files changed, 127 insertions(+), 1069 deletions(-) delete mode 100644 nanovdb/nanovdb/util/BatchAccessor.h delete mode 100644 nanovdb/nanovdb/util/StencilAccessor.h diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt index 8fe727f696..6f84a514d0 100644 --- a/nanovdb/nanovdb/examples/CMakeLists.txt +++ b/nanovdb/nanovdb/examples/CMakeLists.txt @@ -121,26 +121,6 @@ if(TARGET ex_voxelBlockManager_host_cuda) $<$:-mavx2 -fopenmp-simd>) endif() -# CPU-only SIMD stencil gather prototype (Phase 1: neighbor leaf resolution). -# No CUDA required. Design in ex_voxelBlockManager_host_cuda/StencilGather.md. -nanovdb_example(NAME "ex_stencil_gather_cpu") -if(TARGET ex_stencil_gather_cpu) - target_compile_options(ex_stencil_gather_cpu PRIVATE -march=native -fopenmp-simd) - # simd_test/Simd.h lives three levels above this CMakeLists (at the repo root). - target_include_directories(ex_stencil_gather_cpu PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
-endif() - -# CPU-only stencil gather benchmark on a real narrow-band level-set .vdb -# (see BatchAccessor.md §8j for the isOn-branch investigation that motivates -# comparing random-occupancy vs realistic spatial-coherence workloads). -nanovdb_example(NAME "ex_narrowband_stencil_cpu" OPENVDB) -if(TARGET ex_narrowband_stencil_cpu) - target_compile_options(ex_narrowband_stencil_cpu PRIVATE -march=native -fopenmp-simd) - target_include_directories(ex_narrowband_stencil_cpu PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -endif() - # End-to-end CPU WENO5 norm-square-gradient on a narrow-band level set, # with a scalar reference for correctness validation. # (See BatchAccessor.md §11 for the full Phase-2+3 pipeline this demonstrates.) diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp index bdd1022b36..15f1b83800 100644 --- a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp @@ -85,7 +85,7 @@ using LeafT = nanovdb::NanoLeaf; using FloatGridT = nanovdb::NanoGrid; using CPUVBM = nanovdb::tools::VoxelBlockManager; -using LegacyAccT = nanovdb::LegacyStencilAccessor; +using LegacyAccT = nanovdb::LegacyStencilAccessor>; // ============================================================ // VDB loading and NanoVDB conversion diff --git a/nanovdb/nanovdb/util/BatchAccessor.h b/nanovdb/nanovdb/util/BatchAccessor.h deleted file mode 100644 index 67d13a86ad..0000000000 --- a/nanovdb/nanovdb/util/BatchAccessor.h +++ /dev/null @@ -1,503 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 - -/*! - \file BatchAccessor.h - - \brief SIMD-batch analog of NanoVDB's ValueAccessor. - - Caches the 27-entry 3x3x3 leaf-neighbor pointer table around the current - center leaf, amortizing probeLeaf calls across all batches that process - voxels within that leaf. - - Design documented in: - nanovdb/examples/ex_voxelBlockManager_host_cuda/BatchAccessor.md - - Template parameters - ------------------- - BuildT NanoVDB build type (determines tree / leaf types). - ValueT Scalar or SIMD result type of cachedGetValue. - For NanoGrid: float or Simd - For NanoGrid: uint64_t or Simd - VoxelOffsetT Compact (9-bit) voxel offset within a leaf. - Scalar path: uint16_t. SIMD path: Simd. - PredicateT Per-lane active predicate (the leafMask). - Scalar: bool. SIMD: SimdMask. - - Usage - ----- - Scalar defaults allow instantiation without a SIMD library. - For SIMD use, substitute the concrete Simd<> and SimdMask<> types. - - API (see BatchAccessor.md Sec.5 for the full design): - - advance(newLeafID) -- move to a new center leaf - - prefetch(vo, mask) -- warm cache for tap (di,dj,dk) - - cachedGetValue(result, vo, mask) -- fill masked result lanes -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace nanovdb { - -// ============================================================================= -// BatchAccessor -// ============================================================================= - -template -class BatchAccessor -{ - using GridT = NanoGrid; - using TreeT = typename GridT::TreeType; - using LeafT = typename TreeT::LeafNodeType; - - using VO_traits = util::simd_traits; - using Pred_traits = util::simd_traits; - using Val_traits = util::simd_traits; - - // Scalar element type of ValueT (e.g. 
float for Simd) - using ScalarValueT = typename Val_traits::scalar_type; - - static constexpr int LaneWidth = VO_traits::width; - - // SIMD bundle types for the ingredient gather. - // Degrade to plain scalar types when LaneWidth == 1. - using LeafIDVecT = std::conditional_t>; - using LeafDataVecT = std::conditional_t>; - - static_assert(VO_traits::width == Pred_traits::width, - "BatchAccessor: VoxelOffsetT and PredicateT must have the same lane width"); - static_assert(std::is_same_v || - std::is_same_v>, - "BatchAccessor: PredicateT must be bool (scalar) or SimdMask (SIMD)"); - static_assert(Val_traits::width == 1 || Val_traits::width == VO_traits::width, - "BatchAccessor: ValueT lane width must be 1 (scalar) or match VoxelOffsetT"); - - // The SWAR packed layout in prefetch occupies bits 0-14 of each element - // (max packed value 0x1CE7, max sum 0x4A52). The element type must therefore - // be an unsigned integer of at least 16 bits; signed types produce UB on - // carry overflow, and 8-bit types cannot hold the packed fields. - using VoxelOffsetScalarT = util::scalar_traits_t; - static_assert(std::is_unsigned_v, - "BatchAccessor: VoxelOffsetT element type must be unsigned " - "(SWAR carry detection requires wrap-around, not signed overflow)"); - static_assert(sizeof(VoxelOffsetScalarT) >= 2, - "BatchAccessor: VoxelOffsetT element type must be at least 16 bits " - "(SWAR packed layout occupies bits 0-14, max sum 0x4A52)"); - -public: - // ------------------------------------------------------------------------- - // Direction encoding - // - // dir(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1), dx,dy,dz in {-1,0,+1} - // - // Selected entries: - // dir( 0, 0, 0) = 13 -- center leaf (mNeighborLeafIDs[13]) - // dir(-1, 0, 0) = 4 -- x-minus face - // dir(+1, 0, 0) = 22 -- x-plus face - // dir( 0,-1, 0) = 10 -- y-minus face - // dir( 0,+1, 0) = 16 -- y-plus face - // dir( 0, 0,-1) = 12 -- z-minus face - // dir( 0, 0,+1) = 14 -- z-plus face - // - // Sentinel leaf ID for directions outside the narrow band (no leaf exists). - // ------------------------------------------------------------------------- - static constexpr int dir(int dx, int dy, int dz) - { - return (dx + 1) * 9 + (dy + 1) * 3 + (dz + 1); - } - static constexpr uint32_t kNullLeafID = ~uint32_t(0); - - // ------------------------------------------------------------------------- - // SWAR 15-bit packed encoding constants - // - // packed layout: lx[10:12] | gap[13:14] | ly[5:7] | gap[8:9] | lz[0:2] | gap[3:4] - // - // kSwarXZMask -- keeps lz [0:2] and lx [6:8->10:12] after (vo | vo<<4) - // kSwarYMask -- keeps ly [3:5->5:7] after (vo<<2) - // kSwarSentinel-- inactive-lane value: lx=ly=lz=4, chosen so that - // (sentinel + tap) never triggers a false crossing signal - // ------------------------------------------------------------------------- - static constexpr uint16_t kSwarXZMask = 0x1C07u; - static constexpr uint16_t kSwarYMask = 0x00E0u; - static constexpr uint16_t kSwarSentinel = 4u | (4u << 5u) | (4u << 10u); - - // ------------------------------------------------------------------------- - // Construction - // - // Eagerly populates mNeighborLeafIDs[dir(0,0,0)] (the center leaf ID) and - // marks bit 13 in mProbedMask. The center ID is O(1) to compute - // (no probeLeaf needed), so there is no reason to defer it. - // - // Consequence: cachedGetValue<0,0,0> is valid immediately after construction - // without any prefetch call. 
The SWAR neededMask in prefetch never sets - // bit 13 (only crossings fire), so the eager center is never redundantly - // re-probed. - // ------------------------------------------------------------------------- - BatchAccessor(const GridT& grid, uint32_t firstLeafID) - : mGrid(grid) - , mCenterLeafID(firstLeafID) - , mCenterOrigin(grid.tree().getFirstLeaf()[firstLeafID].origin()) - , mProbedMask(1u << dir(0, 0, 0)) - , mFirstLeaf (grid.tree().getFirstLeaf()) - , mOffsetBase (reinterpret_cast(&grid.tree().getFirstLeaf()[0].data()->mOffset)) - , mPrefixBase (reinterpret_cast(&grid.tree().getFirstLeaf()[0].data()->mPrefixSum)) - , mMaskWordBase(grid.tree().getFirstLeaf()[0].valueMask().words()) - { - for (auto& id : mNeighborLeafIDs) id = kNullLeafID; - mNeighborLeafIDs[dir(0, 0, 0)] = mCenterLeafID; - } - - // ------------------------------------------------------------------------- - // centerLeafID -- read the current center leaf ID - // - // Exposed for StencilAccessor::moveTo, which needs it for the - // leafSlice == centerLeafID() comparison in the straddling loop. - // There is no raw setter; advance() is the sole legitimate transition. - // ------------------------------------------------------------------------- - uint32_t centerLeafID() const { return mCenterLeafID; } - - // ------------------------------------------------------------------------- - // advance -- move to a new center leaf - // - // Call when none_of(leafMask): all active lanes have moved past mCenterLeafID. - // Resets all neighbor IDs to kNullLeafID, repopulates the center eagerly, - // and resets mProbedMask to bit 13 so the center is immediately valid. - // Resetting all 27 IDs (108 bytes) ensures mNeighborLeafIDs[d] == kNullLeafID - // iff bit d is absent from mProbedMask -- a clean invariant for SIMD gather. - // ------------------------------------------------------------------------- - void advance(uint32_t newLeafID) - { - mCenterLeafID = newLeafID; - mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin(); - for (auto& id : mNeighborLeafIDs) id = kNullLeafID; - mNeighborLeafIDs[dir(0, 0, 0)] = newLeafID; - mProbedMask = (1u << dir(0, 0, 0)); - } - - // ------------------------------------------------------------------------- - // prefetch -- warm the neighbor cache for stencil tap (di,dj,dk) - // - // For each active (leafMask) lane, computes which neighbor leaf the tap lands - // in and probes it into mLeafNeighbors[] if not already cached in mProbedMask. - // - // The center direction (dir(0,0,0)) is always pre-populated by the constructor - // and advance(), so it never appears in neededMask and never needs probeLeaf. - // Every direction in toProbe is therefore a genuine neighbor: full root-to-leaf - // traversal via mGrid.tree().root().probeLeaf(). - // - // A null result from probeLeaf means the neighbor leaf does not exist (outside - // the narrow band); cachedGetValue returns 0 for those lanes. - // ------------------------------------------------------------------------- - template - void prefetch(VoxelOffsetT vo, PredicateT leafMask) - { - // ----------------------------------------------------------------------- - // SWAR neededMask computation - // - // Replace the scalar per-lane loop with a single SIMD add + two horizontal - // reductions, using a 15-bit packed coordinate representation. 
- // - // packed_lc layout -- 5-bit groups, tightly packed, no inter-group gaps: - // bits 0- 2: lz carry region bits 3-4 (z-axis) - // bits 5- 7: ly carry region bits 8-9 (y-axis) - // bits 10-12: lx carry region bits 13-14 (x-axis) - // - // All carry bits land within [0:14], fitting cleanly in uint16_t with - // bit 15 unused. The z,y,x ordering matches the weight sequence in - // dir(): (dz+1)x1 + (dy+1)x3 + (dx+1)x9. - // - // packed_tap = stencil offsets biased by +8, placed in the same groups: - // (dk+8) at bits [0:...] dk+8 in [5,11] for dk in [-3,3] - // (dj+8) at bits [5:...] - // (di+8) at bits [10:...] - // - // The +8 bias shifts the zero point so that the per-group sum - // s = lc + (d+8), lc in [0,7], d in [-3,3] -> s in [5,18] - // encodes the neighbor coordinate measured from the (-1,-1,-1) leaf: - // s in [ 5, 7]: component + d < 0 -> lo-neighbor (d < 0 case) - // s in [ 8,15]: component + d in [0,7] -> center leaf - // s in [16,18]: component + d >= 8 -> hi-neighbor (d > 0 case) - // - // Carry bits after add: - // bit[+3] SET <=> s >= 8 (= no lo-crossing) - // bit[+4] SET <=> s >= 16 (= hi-crossing) - // - // For prefetch, only one bit per axis is needed (compile-time dispatch): - // dk > 0: z_cross = hor_or & (1 << 4) -- any lane has hi-z carry - // dk < 0: z_cross = !(hor_and & (1 << 3)) -- any lane lacks lo-z guard - // (same at bits [9]/[8] for y, bits [14]/[13] for x) - // - // Inactive lanes carry sentinel lc = 4 per axis: s = d+12 in [9,15] - // -> bit[+3]=1, bit[+4]=0 -> no crossing signal regardless of d. (ok) - // - // For multi-axis taps, may-cross flags are combined conservatively. - // ----------------------------------------------------------------------- - - // Use VoxelOffsetT directly for the packed arithmetic: LaneWidth elements - // of VoxelOffsetScalarT in one register -> one vpaddw (16-bit) or vpaddd - // (32-bit) depending on the instantiation. All intermediate values fit: - // packed_lc <= 0x1CE7, packed_tap <= 0x2D6B, sum <= 0x4A52 < 2^16. - - // Compile-time packed stencil offset (+8-biased per axis, 5-bit groups). - static constexpr auto packed_tap = - static_cast( - (unsigned(dk) + 8u) - | ((unsigned(dj) + 8u) << 5) - | ((unsigned(di) + 8u) << 10)); - - // Expand the 9-bit voxel offset into the 15-bit SWAR packed form. - // vo = lx[6:8] | ly[3:5] | lz[0:2] (NanoVDB leaf layout) - // target: lx[10:12] | ly[5:7] | lz[0:2] - // - // (vo | (vo<<4)) & kSwarXZMask places lz (stays at [0:2]) and lx ([6:8]->[10:12]) - // in one OR+mask; (vo<<2) & kSwarYMask moves ly ([3:5]->[5:7]). - const auto expanded = - ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask)) - | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask)); - - // Blend: active lanes -> expanded form, straddle/inactive -> sentinel. - auto packed_lc = VoxelOffsetT(kSwarSentinel); - util::where(leafMask, packed_lc) = expanded; - - // One SIMD add across all LaneWidth lanes (one vpaddw/vpaddd instruction). - const auto packed_sum = packed_lc + VoxelOffsetT(packed_tap); - - // Horizontal reductions for the carry-bit checks. - const auto hor_or = util::reduce(packed_sum, std::bit_or<>{}); - const auto hor_and = util::reduce(packed_sum, std::bit_and<>{}); - - // Per-axis may-cross flags: compile-time dispatch on sign of d. - // Overflow (d>0): detected by the hi-carry bit (+4 from group base). - // Underflow (d<0): detected by absence of the lo-guard bit (+3). 
- bool x_cross = false, y_cross = false, z_cross = false; - if constexpr (dk > 0) z_cross = bool(hor_or & (1u << 4)); - if constexpr (dk < 0) z_cross = !bool(hor_and & (1u << 3)); - if constexpr (dj > 0) y_cross = bool(hor_or & (1u << 9)); - if constexpr (dj < 0) y_cross = !bool(hor_and & (1u << 8)); - if constexpr (di > 0) x_cross = bool(hor_or & (1u << 14)); - if constexpr (di < 0) x_cross = !bool(hor_and & (1u << 13)); - - // Compile-time crossing sign per axis. - constexpr int sx = (di > 0) ? 1 : -1; // only used when di != 0 - constexpr int sy = (dj > 0) ? 1 : -1; - constexpr int sz = (dk > 0) ? 1 : -1; - - // Build neededMask: face neighbors, then edge and corner (conservative). - uint32_t neededMask = 0u; - if constexpr (di != 0) { if (x_cross) neededMask |= (1u << dir(sx, 0, 0)); } - if constexpr (dj != 0) { if (y_cross) neededMask |= (1u << dir( 0, sy, 0)); } - if constexpr (dk != 0) { if (z_cross) neededMask |= (1u << dir( 0, 0, sz)); } - if constexpr (di != 0 && dj != 0) { if (x_cross && y_cross) neededMask |= (1u << dir(sx, sy, 0)); } - if constexpr (di != 0 && dk != 0) { if (x_cross && z_cross) neededMask |= (1u << dir(sx, 0, sz)); } - if constexpr (dj != 0 && dk != 0) { if (y_cross && z_cross) neededMask |= (1u << dir( 0, sy, sz)); } - if constexpr (di != 0 && dj != 0 && dk != 0) { if (x_cross && y_cross && z_cross) neededMask |= (1u << dir(sx, sy, sz)); } - - // Probe neighbor directions not already cached. - // Every direction here requires probeLeaf (center is pre-populated, never in toProbe). - uint32_t toProbe = neededMask & ~mProbedMask; - if (toProbe) { - const auto& root = mGrid.tree().root(); - do { - const int d = static_cast(util::countTrailingZeros(toProbe)); - const LeafT* leafPtr = root.probeLeaf(originForDir(d)); - mNeighborLeafIDs[d] = leafPtr - ? uint32_t(leafPtr - mGrid.tree().getFirstLeaf()) - : kNullLeafID; - mProbedMask |= (1u << d); - toProbe &= toProbe - 1; - } while (toProbe); - } - } - - // ------------------------------------------------------------------------- - // cachedGetValue -- fill masked result lanes from cached leaf table - // - // For each active (leafMask) lane, computes the local voxel offset within the - // appropriate neighbor leaf and calls leaf->getValue(offset). - // - // Requires prefetch (or any prefetch covering the same directions) - // to have been called first. - // - // A null leaf pointer (neighbor outside the narrow band) writes 0 to dst[lane]. - // Inactive lanes (bit lane of leafMask clear) are not touched. - // - // Output layout: `ScalarValueT (&dst)[LaneWidth]` — a plain C array, one - // entry per SIMD lane. This allows the scalar-tail loop below to write - // lane results with a single `mov`, avoiding the heterogeneous-mask - // where-blend that the old `Simd&` signature triggered. - // - // Hybrid design (BatchAccessor.md §8h / StencilAccessor.md §8.1): - // SIMD portion stays in native uint16_t __m256i (no aggregate ABI): - // SWAR expansion, packed_sum, base-32 direction extract (d_u16), - // local-offset extract (localOffset_u16). - // Harvest: two YMM stores to stack C arrays (neighborIdx[], localOffset[]). - // Scalar tail: per-lane pointer chase into mNeighborLeafIDs + mFirstLeaf, - // one leaf.getValue(offset) call. The LeafNode handles valueMask / - // prefixSum / popcount internally — one popcnt per lookup. 
- // ------------------------------------------------------------------------- - template - void cachedGetValue(ScalarValueT (&dst)[LaneWidth], - VoxelOffsetT vo, - PredicateT leafMask) const - { - // ---- SIMD portion (native __m256i uint16_t throughout — no aggregate ABI) ---- - static constexpr auto packed_tap = - static_cast( - (unsigned(dk) + 8u) - | ((unsigned(dj) + 8u) << 5) - | ((unsigned(di) + 8u) << 10)); - // SWAR expansion of the (x,y,z) local position of the center voxel. - // Inactive-lane values are garbage; the scalar tail below filters them - // out via the leafMask bitmask, so no sentinel / where-blend is needed. - const auto expanded = - ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask)) - | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask)); - const auto packed_sum = expanded + VoxelOffsetT(packed_tap); - - // Per-lane direction index (0..26) via the base-32 multiply trick (§8d). - // Stays in uint16_t — bits [10:14] of (v * 1129) lie entirely below bit 16, - // so the modular uint16_t product gives the same result as the full-width - // product for all valid inputs. No int32 widening → no _Fixed aggregate. - static constexpr uint16_t kSwarCarryMask = 0x6318u; - static constexpr uint16_t kDirMul = 1129u; - static constexpr uint16_t kDirMask = 31u; - const auto d_u16 = (((packed_sum & VoxelOffsetT(kSwarCarryMask)) - >> VoxelOffsetScalarT(3)) - * VoxelOffsetT(kDirMul) - >> VoxelOffsetScalarT(10)) - & VoxelOffsetT(kDirMask); - - // Per-lane 9-bit local offset in the destination leaf. - // NanoVDB leaf layout: offset = (destX << 6) | (destY << 3) | destZ. - // packed_sum bits: destX=[10:12], destY=[5:7], destZ=[0:2] - // output bits: destX=[6:8], destY=[3:5], destZ=[0:2] - const auto localOffset_u16 = - ((packed_sum >> VoxelOffsetScalarT(4)) & VoxelOffsetT(0x1C0u)) - | ((packed_sum >> VoxelOffsetScalarT(2)) & VoxelOffsetT(0x38u)) - | (packed_sum & VoxelOffsetT(0x07u)); - - // ---- Harvest SIMD → C arrays and scalar tail ---- - if constexpr (LaneWidth == 1) { - if (!leafMask) return; // inactive: leave dst[0] alone - const uint32_t leafID = mNeighborLeafIDs[uint32_t(d_u16)]; - if (leafID == kNullLeafID) { dst[0] = ScalarValueT(0); return; } - dst[0] = static_cast( - mFirstLeaf[leafID].getValue(uint32_t(localOffset_u16))); - } else { - alignas(32) uint16_t localOffset[LaneWidth]; - alignas(32) uint16_t neighborIdx[LaneWidth]; - util::store(localOffset_u16, localOffset); - util::store(d_u16, neighborIdx); - - // Convert SIMD leafMask → uint32_t bitmask once; then a single - // scalar loop over active lanes with no further SIMD in sight. - const uint32_t activeBits = util::to_bitmask(leafMask); - for (int lane = 0; lane < LaneWidth; ++lane) { - if (!((activeBits >> lane) & 1u)) continue; - const uint32_t leafID = mNeighborLeafIDs[neighborIdx[lane]]; - if (leafID == kNullLeafID) { - dst[lane] = ScalarValueT(0); - continue; - } - dst[lane] = static_cast( - mFirstLeaf[leafID].getValue(localOffset[lane])); - } - } - } - - // ------------------------------------------------------------------------- - // cachedGetValueInLeaf -- benchmarking variant that forces all - // taps to stay in the center leaf via mod-8 wrap. - // - // Purpose: measure the hybrid pipeline's floor cost when all taps - // access the SAME leaf, with distinct per-tap / per-lane positions (so - // the compiler can't CSE across taps, and we still exercise different - // mValueMask words and prefix-sum slots). 
The result is semantically - // target_local = (voxel_local + (di,dj,dk)) mod 8 - // with target always in the center leaf (direction code 0). - // - // Implementation: same SWAR + harvest + scalar-tail pipeline as - // cachedGetValue, but after `packed_sum = expanded + packed_tap` we mask - // with kSwarFieldMask = 0x1CE7 to discard all inter-field carry bits, - // which is exactly `x mod 8 | y mod 8 | z mod 8` in the packed layout. - // - // Requires di, dj, dk in [0, 7]. No prefetch call needed; the center - // leaf is always in mNeighborLeafIDs[13] from construction/advance. - // ------------------------------------------------------------------------- - template - void cachedGetValueInLeaf(ScalarValueT (&dst)[LaneWidth], - VoxelOffsetT vo, - PredicateT leafMask) const - { - static_assert(di >= 0 && di < 8 && dj >= 0 && dj < 8 && dk >= 0 && dk < 8, - "cachedGetValueInLeaf: tap offsets must be in [0, 7] per axis"); - - static constexpr auto packed_tap = - static_cast( - unsigned(dk) - | (unsigned(dj) << 5) - | (unsigned(di) << 10)); - const auto expanded = - ((vo | (vo << VoxelOffsetScalarT(4))) & VoxelOffsetT(kSwarXZMask)) - | ((vo << VoxelOffsetScalarT(2)) & VoxelOffsetT(kSwarYMask)); - // Mask off inter-field carry bits → per-axis mod-8 wrap; always center. - static constexpr uint16_t kSwarFieldMask = 0x1CE7u; - const auto packed_sum = - (expanded + VoxelOffsetT(packed_tap)) & VoxelOffsetT(kSwarFieldMask); - - // Extract 9-bit local offset (same layout as cachedGetValue). - const auto localOffset_u16 = - ((packed_sum >> VoxelOffsetScalarT(4)) & VoxelOffsetT(0x1C0u)) - | ((packed_sum >> VoxelOffsetScalarT(2)) & VoxelOffsetT(0x38u)) - | (packed_sum & VoxelOffsetT(0x07u)); - - if constexpr (LaneWidth == 1) { - if (!leafMask) return; - dst[0] = static_cast( - mFirstLeaf[mCenterLeafID].getValue(uint32_t(localOffset_u16))); - } else { - alignas(32) uint16_t localOffset[LaneWidth]; - util::store(localOffset_u16, localOffset); - const uint32_t activeBits = util::to_bitmask(leafMask); - const LeafT* const leaf = &mFirstLeaf[mCenterLeafID]; // hoisted - for (int lane = 0; lane < LaneWidth; ++lane) { - if (!((activeBits >> lane) & 1u)) continue; - dst[lane] = static_cast( - leaf->getValue(localOffset[lane])); - } - } - } - -private: - // Compute the world-space origin of the leaf at direction bit d from center. - // bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1); leaf stride = 8 per axis. - Coord originForDir(int d) const - { - const int dx = d / 9 - 1; - const int dy = (d / 3) % 3 - 1; - const int dz = d % 3 - 1; - return mCenterOrigin + Coord(dx * 8, dy * 8, dz * 8); - } - - const GridT& mGrid; - uint32_t mCenterLeafID; - Coord mCenterOrigin; - uint32_t mProbedMask; - uint32_t mNeighborLeafIDs[27]; // kNullLeafID when not probed or outside narrow band - const LeafT* const mFirstLeaf; // getFirstLeaf() — scalar-tail leaf lookup base - const uint64_t* const mOffsetBase; // &getFirstLeaf()[0].data()->mOffset - const uint64_t* const mPrefixBase; // &getFirstLeaf()[0].data()->mPrefixSum - const uint64_t* const mMaskWordBase; // getFirstLeaf()[0].valueMask().words() -}; - -} // namespace nanovdb diff --git a/nanovdb/nanovdb/util/LegacyStencilAccessor.h b/nanovdb/nanovdb/util/LegacyStencilAccessor.h index a53c29ef5c..06a2a263b3 100644 --- a/nanovdb/nanovdb/util/LegacyStencilAccessor.h +++ b/nanovdb/nanovdb/util/LegacyStencilAccessor.h @@ -7,22 +7,13 @@ \brief Scalar stencil-index accessor using a NanoVDB ReadAccessor. 
LegacyStencilAccessor resolves each stencil tap via a path-cached - NanoVDB ReadAccessor, one voxel at a time. It is templatized on the - same StencilT policy class used by StencilAccessor, so the tap-offset - table is shared at compile time. + NanoVDB ReadAccessor, one voxel at a time. It is templatized on a + StencilT policy class whose Taps tuple defines the tap offsets. This mirrors the approach of OpenVDB's math/Stencils.h: the accessor caches the last-visited tree path so that consecutive taps within the same leaf are cheap, but distant taps (e.g. WENO5 radius-3 offsets) - can evict the center-leaf path. That cache-pressure problem is the - motivation for the BatchAccessor / StencilAccessor design. - - Intended uses - ------------- - - Correctness oracle for StencilAccessor: sharing StencilT guarantees - identical tap offsets, so a mismatch is a genuine bug. - - Benchmark baseline: measures the cost of the accessor path-eviction - problem that StencilAccessor is designed to eliminate. + can evict the center-leaf path. Thread safety ------------- @@ -32,14 +23,14 @@ ------------------- BuildT NanoVDB build type (e.g. ValueOnIndex). StencilT Policy class describing the stencil. Must expose: - using Taps = std::tuple...>; - Same type as passed to StencilAccessor. + using Taps = std::tuple...>; + where each S is any type with static int members di, dj, dk + (e.g. WenoStencil<>::TapPoint). */ #pragma once #include -#include // StencilPoint, detail::findIndex #include #include @@ -55,6 +46,22 @@ class LegacyStencilAccessor static constexpr int SIZE = int(std::tuple_size_v); + // Compile-time inverse map: (DI,DJ,DK) → slot index in StencilT::Taps. + // Returns -1 if no matching tap exists; getValue() turns that into a + // static_assert. Same shape as WenoStencil::findTap (kept local here + // to avoid a cross-header dependency). + template + static constexpr int findTap(std::index_sequence) + { + using Taps = typename StencilT::Taps; + int result = -1; + ((std::tuple_element_t::di == DI && + std::tuple_element_t::dj == DJ && + std::tuple_element_t::dk == DK && + result < 0 ? (result = int(Is)) : 0), ...); + return result; + } + public: // Leaf-only ReadAccessor (cache level 0 only). The DefaultReadAccessor // (levels 0/1/2) caches upper and lower nodes too, but those slots are @@ -88,15 +95,11 @@ class LegacyStencilAccessor // ------------------------------------------------------------------------- // getValue -- compile-time named tap access. - // - // Same interface as StencilAccessor::getValue; resolved entirely at - // compile time via detail::findIndex. // ------------------------------------------------------------------------- template uint64_t getValue() const { - constexpr int I = detail::findIndex( - std::make_index_sequence{}); + constexpr int I = findTap(std::make_index_sequence{}); static_assert(I >= 0, "LegacyStencilAccessor::getValue: tap not in stencil"); return mStencil[I]; } diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h index af88a6028a..c73c15f8be 100644 --- a/nanovdb/nanovdb/util/Simd.h +++ b/nanovdb/nanovdb/util/Simd.h @@ -2,6 +2,8 @@ #include #include +#include // __hostdev__ + // Minimal SIMD abstraction for NanoVDB stencil kernels. // // Two implementations, selected automatically at compile time: @@ -21,15 +23,6 @@ // // Mirrors the C++26 std::simd naming — migration will be a typedef swap. 
-// --------------------------------------------------------------------------- -// Portability: __hostdev__ is a no-op outside CUDA -// --------------------------------------------------------------------------- -#ifndef __CUDACC__ -# define NANOVDB_SIMD_HOSTDEV -#else -# define NANOVDB_SIMD_HOSTDEV __host__ __device__ -#endif - // --------------------------------------------------------------------------- // Auto-detect std::experimental::simd (Parallelism TS v2) // --------------------------------------------------------------------------- @@ -68,22 +61,22 @@ using SimdMask = stdx::fixed_size_simd_mask; template using Simd = stdx::fixed_size_simd; -// [[gnu::always_inline]] forces these thin wrappers to inline at every -// call site. Without it, GCC's cost model sometimes outlines them — -// each call then pays a function-call + vzeroupper + register-ABI -// transition that dominates the one-instruction body (vminps / vmaxps / -// vblendvps). See BatchAccessor.md §8h for the analogous fix on the -// StencilAccessor path. +// NANOVDB_FORCEINLINE (see Util.h) forces these thin wrappers to inline +// at every call site. Without it, GCC's cost model sometimes outlines +// them — each call then pays a function-call + vzeroupper + register- +// ABI transition that dominates the one-instruction body +// (vminps / vmaxps / vblendvps). See BatchAccessor.md §8h for the +// analogous fix on the StencilAccessor path. template -[[gnu::always_inline]] inline Simd min(Simd a, Simd b) { return stdx::min(a, b); } +NANOVDB_FORCEINLINE Simd min(Simd a, Simd b) { return stdx::min(a, b); } template -[[gnu::always_inline]] inline Simd max(Simd a, Simd b) { return stdx::max(a, b); } +NANOVDB_FORCEINLINE Simd max(Simd a, Simd b) { return stdx::max(a, b); } // TS v2 where(mask, v) is a masked assignment proxy, not a 3-arg select. // Wrap it into the select(mask, a, b) form our kernels expect. template -[[gnu::always_inline]] inline Simd where(SimdMask mask, Simd a, Simd b) { +NANOVDB_FORCEINLINE Simd where(SimdMask mask, Simd a, Simd b) { auto result = b; stdx::where(mask, result) = a; return result; @@ -91,7 +84,7 @@ template // Heterogeneous where: mask element type U ≠ value element type T. // Converts the U-mask to a T-mask via a boolean round-trip. template -[[gnu::always_inline]] inline Simd where(SimdMask mask, Simd a, Simd b) { +NANOVDB_FORCEINLINE Simd where(SimdMask mask, Simd a, Simd b) { bool arr[W]; for (int i = 0; i < W; i++) arr[i] = static_cast(mask[i]); SimdMask tmask(arr, element_aligned); @@ -183,25 +176,25 @@ template struct SimdMask { std::array data{}; SimdMask() = default; - NANOVDB_SIMD_HOSTDEV explicit SimdMask(const bool* p, element_aligned_tag) { + __hostdev__ explicit SimdMask(const bool* p, element_aligned_tag) { for (int i = 0; i < W; i++) data[i] = p[i]; } // Converting constructor: copy bool values from a mask over a different element type. // All SimdMask are boolean arrays of the same width; this allows // where(SimdMask, Simd, Simd) without explicit casting. 
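Since the TS v2 masked-assignment proxy is the part that trips up newcomers, here is the wrapper's pattern in isolation. A sketch assuming GCC's `<experimental/simd>` is available; `select` is an illustrative name for the 3-argument form the kernels expect:

```cpp
#include <experimental/simd>
namespace stdx = std::experimental;

template<class T, int W>
stdx::fixed_size_simd<T, W> select(stdx::fixed_size_simd_mask<T, W> mask,
                                   stdx::fixed_size_simd<T, W> a,
                                   stdx::fixed_size_simd<T, W> b)
{
    auto result = b;                 // start from the "false" lanes
    stdx::where(mask, result) = a;   // proxy assignment overwrites "true" lanes
    return result;                   // == per-lane (mask ? a : b)
}
```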
template - NANOVDB_SIMD_HOSTDEV explicit SimdMask(SimdMask const& o) { + __hostdev__ explicit SimdMask(SimdMask const& o) { for (int i = 0; i < W; i++) data[i] = o[i]; } - NANOVDB_SIMD_HOSTDEV bool operator[](int i) const { return data[i]; } - NANOVDB_SIMD_HOSTDEV bool& operator[](int i) { return data[i]; } - NANOVDB_SIMD_HOSTDEV SimdMask operator!() const { + __hostdev__ bool operator[](int i) const { return data[i]; } + __hostdev__ bool& operator[](int i) { return data[i]; } + __hostdev__ SimdMask operator!() const { SimdMask r; for (int i = 0; i < W; i++) r.data[i] = !data[i]; return r; } - NANOVDB_SIMD_HOSTDEV SimdMask operator&(SimdMask o) const { + __hostdev__ SimdMask operator&(SimdMask o) const { SimdMask r; for (int i = 0; i < W; i++) r.data[i] = data[i] && o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV SimdMask operator|(SimdMask o) const { + __hostdev__ SimdMask operator|(SimdMask o) const { SimdMask r; for (int i = 0; i < W; i++) r.data[i] = data[i] || o.data[i]; return r; } }; @@ -211,100 +204,100 @@ struct Simd { std::array data{}; Simd() = default; - NANOVDB_SIMD_HOSTDEV Simd(T scalar) { data.fill(scalar); } // broadcast - NANOVDB_SIMD_HOSTDEV explicit Simd(const T* p, element_aligned_tag) { // load + __hostdev__ Simd(T scalar) { data.fill(scalar); } // broadcast + __hostdev__ explicit Simd(const T* p, element_aligned_tag) { // load for (int i = 0; i < W; i++) data[i] = p[i]; } - NANOVDB_SIMD_HOSTDEV T operator[](int i) const { return data[i]; } - NANOVDB_SIMD_HOSTDEV T& operator[](int i) { return data[i]; } - NANOVDB_SIMD_HOSTDEV void store(T* p, element_aligned_tag = {}) const { // store + __hostdev__ T operator[](int i) const { return data[i]; } + __hostdev__ T& operator[](int i) { return data[i]; } + __hostdev__ void store(T* p, element_aligned_tag = {}) const { // store for (int i = 0; i < W; i++) p[i] = data[i]; } - NANOVDB_SIMD_HOSTDEV Simd operator-() const { + __hostdev__ Simd operator-() const { Simd r; for (int i = 0; i < W; i++) r.data[i] = -data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator+(Simd o) const { + __hostdev__ Simd operator+(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] + o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator-(Simd o) const { + __hostdev__ Simd operator-(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] - o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator*(Simd o) const { + __hostdev__ Simd operator*(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] * o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator/(Simd o) const { + __hostdev__ Simd operator/(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] / o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV SimdMask operator>(Simd o) const { + __hostdev__ SimdMask operator>(Simd o) const { SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] > o.data[i]; return m; } - NANOVDB_SIMD_HOSTDEV SimdMask operator==(Simd o) const { + __hostdev__ SimdMask operator==(Simd o) const { SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] == o.data[i]; return m; } - NANOVDB_SIMD_HOSTDEV SimdMask operator!=(Simd o) const { + __hostdev__ SimdMask operator!=(Simd o) const { SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] != o.data[i]; return m; } // Bitwise and shift operators — valid for integer element types. 
- NANOVDB_SIMD_HOSTDEV Simd operator|(Simd o) const { + __hostdev__ Simd operator|(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] | o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator&(Simd o) const { + __hostdev__ Simd operator&(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] & o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator^(Simd o) const { + __hostdev__ Simd operator^(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] ^ o.data[i]; return r; } // Per-lane variable shift (shift count from corresponding lane of o). - NANOVDB_SIMD_HOSTDEV Simd operator<<(Simd o) const { + __hostdev__ Simd operator<<(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] << o.data[i]; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator>>(Simd o) const { + __hostdev__ Simd operator>>(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] >> o.data[i]; return r; } // Uniform shift: all lanes shifted by the same scalar count (vpsllw imm8 / vpsrlw imm8). - NANOVDB_SIMD_HOSTDEV Simd operator<<(T shift) const { + __hostdev__ Simd operator<<(T shift) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] << shift; return r; } - NANOVDB_SIMD_HOSTDEV Simd operator>>(T shift) const { + __hostdev__ Simd operator>>(T shift) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] >> shift; return r; } }; -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator+(T a, Simd b) { return Simd(a) + b; } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator+(Simd a, T b) { return a + Simd(b); } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator-(T a, Simd b) { return Simd(a) - b; } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator-(Simd a, T b) { return a - Simd(b); } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator*(T a, Simd b) { return Simd(a) * b; } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator*(Simd a, T b) { return a * Simd(b); } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator/(T a, Simd b) { return Simd(a) / b; } -template NANOVDB_SIMD_HOSTDEV +template __hostdev__ Simd operator/(Simd a, T b) { return a / Simd(b); } template -NANOVDB_SIMD_HOSTDEV Simd min(Simd a, Simd b) { +__hostdev__ Simd min(Simd a, Simd b) { Simd r; for (int i = 0; i < W; i++) r[i] = a[i] < b[i] ? a[i] : b[i]; return r; } template -NANOVDB_SIMD_HOSTDEV Simd max(Simd a, Simd b) { +__hostdev__ Simd max(Simd a, Simd b) { Simd r; for (int i = 0; i < W; i++) r[i] = a[i] > b[i] ? a[i] : b[i]; return r; } template -NANOVDB_SIMD_HOSTDEV Simd where(SimdMask mask, Simd a, Simd b) { +__hostdev__ Simd where(SimdMask mask, Simd a, Simd b) { Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r; } // Heterogeneous where: mask element type U need not match value element type T. // Useful for applying PredicateT=SimdMask to VoxelOffsetT=Simd. template -NANOVDB_SIMD_HOSTDEV Simd where(SimdMask mask, Simd a, Simd b) { +__hostdev__ Simd where(SimdMask mask, Simd a, Simd b) { Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? 
a[i] : b[i]; return r; } @@ -315,14 +308,14 @@ template struct WhereExpression { const SimdMask& mask; Simd& target; - NANOVDB_SIMD_HOSTDEV WhereExpression& operator=(const Simd& value) { + __hostdev__ WhereExpression& operator=(const Simd& value) { for (int i = 0; i < W; ++i) if (mask[i]) target[i] = value[i]; return *this; } }; template -NANOVDB_SIMD_HOSTDEV WhereExpression where(const SimdMask& mask, Simd& target) { +__hostdev__ WhereExpression where(const SimdMask& mask, Simd& target) { return {mask, target}; } @@ -330,32 +323,32 @@ NANOVDB_SIMD_HOSTDEV WhereExpression where(const SimdMask& mask, Sim // Mirrors std::experimental::reduce(v, binary_op). // Use with std::bit_or<>{}, std::bit_and<>{}, std::plus<>{}, etc. template -NANOVDB_SIMD_HOSTDEV T reduce(Simd v, BinaryOp op) { +__hostdev__ T reduce(Simd v, BinaryOp op) { T r = v[0]; for (int i = 1; i < W; ++i) r = op(r, v[i]); return r; } template -NANOVDB_SIMD_HOSTDEV bool any_of(SimdMask m) { +__hostdev__ bool any_of(SimdMask m) { bool r = false; for (int i = 0; i < W; i++) r |= m[i]; return r; } template -NANOVDB_SIMD_HOSTDEV bool none_of(SimdMask m) { return !any_of(m); } +__hostdev__ bool none_of(SimdMask m) { return !any_of(m); } template -NANOVDB_SIMD_HOSTDEV bool all_of(SimdMask m) { +__hostdev__ bool all_of(SimdMask m) { bool r = true; for (int i = 0; i < W; i++) r &= m[i]; return r; } // Store W lanes of v into p[0..W-1] (array-backend passthrough to member). template -NANOVDB_SIMD_HOSTDEV void store(Simd v, T* p, element_aligned_tag = {}) { +__hostdev__ void store(Simd v, T* p, element_aligned_tag = {}) { v.store(p); } // Unmasked gather: result[i] = ptr[idx[i]] for all lanes. template -NANOVDB_SIMD_HOSTDEV Simd gather(const T* __restrict__ ptr, Simd idx) { +__hostdev__ Simd gather(const T* __restrict__ ptr, Simd idx) { Simd r; for (int i = 0; i < W; i++) r[i] = ptr[idx[i]]; return r; @@ -364,7 +357,7 @@ NANOVDB_SIMD_HOSTDEV Simd gather(const T* __restrict__ ptr, Simd id // Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback. // Scalar path: accesses ptr only for true lanes (ternary short-circuits). template -NANOVDB_SIMD_HOSTDEV Simd gather(SimdMask mask, const T* __restrict__ ptr, +__hostdev__ Simd gather(SimdMask mask, const T* __restrict__ ptr, Simd idx, T fallback = T(0)) { Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? ptr[idx[i]] : fallback; @@ -375,7 +368,7 @@ NANOVDB_SIMD_HOSTDEV Simd gather(SimdMask mask, const T* __restrict__ // MaskElemT may differ from T (heterogeneous mask). // Scalar path: only accesses ptr for true lanes. template -NANOVDB_SIMD_HOSTDEV void gather_if(Simd& dst, SimdMask mask, +__hostdev__ void gather_if(Simd& dst, SimdMask mask, const T* __restrict__ ptr, Simd idx) { for (int i = 0; i < W; i++) if (mask[i]) dst[i] = ptr[idx[i]]; @@ -394,7 +387,7 @@ NANOVDB_SIMD_HOSTDEV void gather_if(Simd& dst, SimdMask mask, // Scalar overload: degrades to static_cast for plain scalar types. 
// --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV Simd simd_cast(Simd src) { +__hostdev__ Simd simd_cast(Simd src) { #ifdef NANOVDB_USE_STD_SIMD return Simd([&](int i) { return static_cast(src[i]); }); #else @@ -404,7 +397,7 @@ NANOVDB_SIMD_HOSTDEV Simd simd_cast(Simd src) { #endif } template -NANOVDB_SIMD_HOSTDEV DstT simd_cast(SrcT src) { return static_cast(src); } +__hostdev__ DstT simd_cast(SrcT src) { return static_cast(src); } // --------------------------------------------------------------------------- // simd_cast_if — masked element-wise cast (merge-masked). @@ -419,11 +412,11 @@ NANOVDB_SIMD_HOSTDEV DstT simd_cast(SrcT src) { return static_cast(src); } // Scalar fallback: plain conditional cast. // --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV void simd_cast_if(Simd& dst, SimdMask mask, Simd src) { +__hostdev__ void simd_cast_if(Simd& dst, SimdMask mask, Simd src) { dst = where(mask, simd_cast(src), dst); } template -NANOVDB_SIMD_HOSTDEV void simd_cast_if(DstT& dst, bool mask, SrcT src) { +__hostdev__ void simd_cast_if(DstT& dst, bool mask, SrcT src) { if (mask) dst = static_cast(src); } @@ -435,7 +428,7 @@ NANOVDB_SIMD_HOSTDEV void simd_cast_if(DstT& dst, bool mask, SrcT src) { // The final byte-sum uses a shift-and-add tree instead of the multiply trick // (v * 0x0101...) since 64x64->64 multiply has no AVX2 equivalent (vpmullq is AVX-512). // --------------------------------------------------------------------------- -NANOVDB_SIMD_HOSTDEV inline uint64_t popcount64(uint64_t v) +__hostdev__ inline uint64_t popcount64(uint64_t v) { v -= (v >> 1) & uint64_t(0x5555555555555555); v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333)); @@ -449,7 +442,7 @@ NANOVDB_SIMD_HOSTDEV inline uint64_t popcount64(uint64_t v) // Lane-wise SIMD popcount: applies popcount64 to every lane. // Backend A: generator constructor; Backend B: element loop (auto-vectorized by GCC/Clang). template -NANOVDB_SIMD_HOSTDEV Simd popcount(Simd v) { +__hostdev__ Simd popcount(Simd v) { #ifdef NANOVDB_USE_STD_SIMD return Simd([&](int i) { return popcount64(v[i]); }); #else @@ -458,7 +451,7 @@ NANOVDB_SIMD_HOSTDEV Simd popcount(Simd v) { return r; #endif } -NANOVDB_SIMD_HOSTDEV inline uint64_t popcount(uint64_t v) { return popcount64(v); } +__hostdev__ inline uint64_t popcount(uint64_t v) { return popcount64(v); } // --------------------------------------------------------------------------- // simd_traits — generic per-lane access for scalar and Simd types. 
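The SWAR popcount above can be cross-checked against the GCC/Clang builtin. One edge case seems worth flagging: the header's final `& 63` maps a fully-set word (population 64) back to 0, so the sketch below masks with `0x7F` instead; whether any caller can see an all-ones mask word should be confirmed before relying on the `& 63` form.

```cpp
#include <cassert>
#include <cstdint>

uint64_t popcount64(uint64_t v)
{
    v -= (v >> 1) & 0x5555555555555555ull;                   // 2-bit field counts
    v  = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
    v  = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;             // per-byte counts
    v += v >> 8;  v &= 0x00FF00FF00FF00FFull;                // shift-and-add tree,
    v += v >> 16; v &= 0x0000FFFF0000FFFFull;                // no 64x64 multiply
    v += v >> 32;
    return v & 0x7F;                                         // 64 needs 7 bits
}

int main()
{
    uint64_t x = 0x9E3779B97F4A7C15ull;
    for (int i = 0; i < 1000; ++i) {
        assert(popcount64(x) == uint64_t(__builtin_popcountll(x)));
        x = x * 6364136223846793005ull + 1442695040888963407ull;  // LCG walk
    }
    assert(popcount64(~0ull) == 64);  // the & 63 variant would return 0 here
    return 0;
}
```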
@@ -474,16 +467,16 @@ template struct simd_traits { static constexpr int width = 1; using scalar_type = T; - NANOVDB_SIMD_HOSTDEV static T get(T v, int) { return v; } - NANOVDB_SIMD_HOSTDEV static void set(T& v, int, T val) { v = val; } + __hostdev__ static T get(T v, int) { return v; } + __hostdev__ static void set(T& v, int, T val) { v = val; } }; template<> struct simd_traits { static constexpr int width = 1; using scalar_type = bool; - NANOVDB_SIMD_HOSTDEV static bool get(bool m, int) { return m; } - NANOVDB_SIMD_HOSTDEV static void set(bool& m, int, bool v) { m = v; } + __hostdev__ static bool get(bool m, int) { return m; } + __hostdev__ static void set(bool& m, int, bool v) { m = v; } }; // Simd and SimdMask: valid for both backends because the aliases @@ -492,16 +485,16 @@ template struct simd_traits> { static constexpr int width = W; using scalar_type = T; - NANOVDB_SIMD_HOSTDEV static T get(Simd v, int i) { return v[i]; } - NANOVDB_SIMD_HOSTDEV static void set(Simd& v, int i, T val) { v[i] = val; } + __hostdev__ static T get(Simd v, int i) { return v[i]; } + __hostdev__ static void set(Simd& v, int i, T val) { v[i] = val; } }; template struct simd_traits> { static constexpr int width = W; using scalar_type = bool; - NANOVDB_SIMD_HOSTDEV static bool get(SimdMask m, int i) { return m[i]; } - NANOVDB_SIMD_HOSTDEV static void set(SimdMask& m, int i, bool v) { m[i] = v; } + __hostdev__ static bool get(SimdMask m, int i) { return m[i]; } + __hostdev__ static void set(SimdMask& m, int i, bool v) { m[i] = v; } }; // --------------------------------------------------------------------------- @@ -527,7 +520,7 @@ using scalar_traits_t = typename scalar_traits::type; // T is the associated element type; only W matters. Requires W <= 32. // --------------------------------------------------------------------------- template -NANOVDB_SIMD_HOSTDEV uint32_t to_bitmask(SimdMask m) { +__hostdev__ uint32_t to_bitmask(SimdMask m) { static_assert(W <= 32, "to_bitmask: W must be <= 32"); uint32_t r = 0; for (int i = 0; i < W; i++) if (m[i]) r |= (1u << i); @@ -537,31 +530,31 @@ NANOVDB_SIMD_HOSTDEV uint32_t to_bitmask(SimdMask m) { // --------------------------------------------------------------------------- // Scalar overloads — always present, for T=float (GPU / scalar path) // --------------------------------------------------------------------------- -template NANOVDB_SIMD_HOSTDEV T min(T a, T b) { return a < b ? a : b; } -template NANOVDB_SIMD_HOSTDEV T max(T a, T b) { return a > b ? a : b; } -template NANOVDB_SIMD_HOSTDEV T where(bool m, T a, T b) { return m ? a : b; } +template __hostdev__ T min(T a, T b) { return a < b ? a : b; } +template __hostdev__ T max(T a, T b) { return a > b ? a : b; } +template __hostdev__ T where(bool m, T a, T b) { return m ? a : b; } template -NANOVDB_SIMD_HOSTDEV T reduce(T v, BinaryOp) { return v; } +__hostdev__ T reduce(T v, BinaryOp) { return v; } // 2-argument where: scalar masked-assignment proxy matching the Simd form. // where(mask, target) = value writes value into target only if mask is true. template struct ScalarWhereProxy { bool mask; T& target; - NANOVDB_SIMD_HOSTDEV void operator=(const T& v) { if (mask) target = v; } + __hostdev__ void operator=(const T& v) { if (mask) target = v; } }; template -NANOVDB_SIMD_HOSTDEV ScalarWhereProxy where(bool mask, T& target) { +__hostdev__ ScalarWhereProxy where(bool mask, T& target) { return {mask, target}; } // Unmasked scalar gather: result = ptr[idx]. 
template -NANOVDB_SIMD_HOSTDEV T gather(const T* __restrict__ ptr, IdxT idx) { return ptr[idx]; } +__hostdev__ T gather(const T* __restrict__ ptr, IdxT idx) { return ptr[idx]; } // Merge-masked scalar gather: dst = ptr[idx] only if mask, else dst unchanged. template -NANOVDB_SIMD_HOSTDEV void gather_if(T& dst, bool mask, const T* __restrict__ ptr, IdxT idx) { +__hostdev__ void gather_if(T& dst, bool mask, const T* __restrict__ ptr, IdxT idx) { if (mask) dst = ptr[idx]; } diff --git a/nanovdb/nanovdb/util/StencilAccessor.h b/nanovdb/nanovdb/util/StencilAccessor.h deleted file mode 100644 index 7429a11f64..0000000000 --- a/nanovdb/nanovdb/util/StencilAccessor.h +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 - -/*! - \file StencilAccessor.h - - \brief SIMD stencil-index gatherer built on BatchAccessor. - - Wraps a BatchAccessor and owns the straddling loop, prefetch-hull - sequencing, and per-tap cachedGetValue calls for one VBM block. - Its output is a fixed-size array of Simd — one vector - per stencil tap — containing ValueOnIndex indices for all W lanes. - - Design documented in: - nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilAccessor.md - - Template parameters - ------------------- - BuildT NanoVDB build type (e.g. ValueOnIndex). - W SIMD lane width. - StencilT Policy class describing the stencil. Must expose: - using Taps = std::tuple...>; - using Hull = std::tuple...>; - UnusedLeafIndex - Sentinel written by decodeInverseMaps for padding slots. - Defaults to ~uint32_t(0) (VoxelBlockManagerBase::UnusedLeafIndex). - - Usage - ----- - Construct once per VBM block; call moveTo() for each SIMD batch. - See StencilAccessor.md §10 for the caller pattern. -*/ - -#pragma once - -#include -#include -#include - -#include -#include // std::memset -#include -#include -#include -#include // std::index_sequence, std::make_index_sequence - -namespace nanovdb { - -// ============================================================================= -// StencilPoint — compile-time stencil tap offset -// ============================================================================= - -/// Compile-time 3D offset used as a type (not a value) in StencilT::Taps -/// and StencilT::Hull tuples. -template -struct StencilPoint { - static constexpr int di = DI; - static constexpr int dj = DJ; - static constexpr int dk = DK; -}; - -// ============================================================================= -// findIndex — compile-time inverse map: (DI,DJ,DK) → slot index in a Taps tuple -// ============================================================================= - -namespace detail { - -/// Returns the first index Is in [0,N) where tuple_element_t -/// matches (DI,DJ,DK), or -1 if not found. -template -constexpr int findIndex(std::index_sequence) -{ - int result = -1; - // Fold: for each Is, if the tap matches and we haven't found one yet, record it. - ((std::tuple_element_t::di == DI && - std::tuple_element_t::dj == DJ && - std::tuple_element_t::dk == DK && - result < 0 - ? (result = int(Is)) : 0), ...); - return result; -} - -} // namespace detail - -// ============================================================================= -// Weno5Stencil — 19-tap axis-aligned WENO5 stencil, radius 3 -// ============================================================================= - -/// Concrete StencilT for the WENO5 3D stencil. -/// Taps: 19 axis-aligned offsets — the center plus {±1,±2,±3} along each of x,y,z. 
-/// Hull: 6 extremal offsets that cover all 18 non-center tap crossing directions. -/// -/// Tap ordering matches WenoPt::idx in nanovdb/math/Stencils.h: -/// idx 0 : <0,0,0> -/// idx 1.. 6 : x-axis <-3,0,0> <-2,0,0> <-1,0,0> <+1,0,0> <+2,0,0> <+3,0,0> -/// idx 7..12 : y-axis <0,-3,0> <0,-2,0> <0,-1,0> <0,+1,0> <0,+2,0> <0,+3,0> -/// idx 13..18 : z-axis <0,0,-3> <0,0,-2> <0,0,-1> <0,0,+1> <0,0,+2> <0,0,+3> -struct Weno5Stencil { - using Taps = std::tuple< - // center - StencilPoint< 0, 0, 0>, - // x-axis - StencilPoint<-3, 0, 0>, StencilPoint<-2, 0, 0>, StencilPoint<-1, 0, 0>, - StencilPoint<+1, 0, 0>, StencilPoint<+2, 0, 0>, StencilPoint<+3, 0, 0>, - // y-axis - StencilPoint< 0,-3, 0>, StencilPoint< 0,-2, 0>, StencilPoint< 0,-1, 0>, - StencilPoint< 0,+1, 0>, StencilPoint< 0,+2, 0>, StencilPoint< 0,+3, 0>, - // z-axis - StencilPoint< 0, 0,-3>, StencilPoint< 0, 0,-2>, StencilPoint< 0, 0,-1>, - StencilPoint< 0, 0,+1>, StencilPoint< 0, 0,+2>, StencilPoint< 0, 0,+3> - >; - // Hull = 6 extremal taps that collectively probe all reachable face-neighbor - // directions for any combination of voxel position and non-center WENO5 tap. - // The center tap never crosses a leaf, so it's absent here by design. - // See StencilAccessor.md §4b for the monotonicity argument. - using Hull = std::tuple< - StencilPoint<-3, 0, 0>, StencilPoint<+3, 0, 0>, - StencilPoint< 0,-3, 0>, StencilPoint< 0,+3, 0>, - StencilPoint< 0, 0,-3>, StencilPoint< 0, 0,+3> - >; -}; - -// ============================================================================= -// StencilAccessor -// ============================================================================= - -template -class StencilAccessor -{ - using GridT = NanoGrid; - - // ------------------------------------------------------------------------- - // Private type aliases — only used inside moveTo(). - // - // These are the W-lane SIMD types that carry the input arrays through the - // straddling loop and the SWAR direction extraction. They do NOT appear - // in the public API: callers consume `mIndices` (raw uint64_t[SIZE][W]) - // directly, and `moveTo` returns `void` — active-lane information is read - // from `leafIndex[]` vs `UnusedLeafIndex` by the caller. - // ------------------------------------------------------------------------- - using OffsetVec = std::conditional_t>; - using LeafIdVec = std::conditional_t>; - using LeafMaskVec = std::conditional_t>; - - using BatchAcc = std::conditional_t, - BatchAccessor>; - - static constexpr int SIZE = int(std::tuple_size_v); - static constexpr int HULL_SIZE = int(std::tuple_size_v); - -public: - // ------------------------------------------------------------------------- - // Public API — entirely free of Simd<>/SimdMask<> types. - // - // Storage layout: `mIndices[tap][lane]` is a plain uint64_t. Callers are - // free to SIMD-load it with whatever backend they choose - // (e.g. `Simd::load(stencilAcc.mIndices[k], element_aligned)`), - // iterate scalarly, or pass slices to downstream kernels — we don't - // impose a choice. - // - // Layout is part of the ABI: [SIZE][W] row-major. Changing it is - // a breaking change. - // ------------------------------------------------------------------------- - alignas(64) uint64_t mIndices[SIZE][W]; - - // ------------------------------------------------------------------------- - // Construction - // - // firstLeafID -- VBM block's starting leaf ID (vbm.hostFirstLeafID()[blockID]). 
- // nExtraLeaves -- number of distinct center-leaf advances possible in this block - // (computed by the caller from the jumpMap). Debug-only bound - // on the straddling loop; not needed for correctness. - // ------------------------------------------------------------------------- - StencilAccessor(const GridT& grid, uint32_t firstLeafID, uint32_t nExtraLeaves) - : mBatch(grid, firstLeafID) -#ifndef NDEBUG - , mNExtraLeaves(nExtraLeaves) -#endif - { - (void)nExtraLeaves; - } - - // ------------------------------------------------------------------------- - // moveTo -- fill mIndices[0..SIZE-1][0..W-1] for a W-wide batch. - // - // leafIndex -- ptr to leafIndex[batchStart] (uint32_t from decodeInverseMaps) - // voxelOffset -- ptr to voxelOffset[batchStart] (uint16_t from decodeInverseMaps) - // - // Active-lane semantics: a lane i is "active" iff - // leafIndex[i] != UnusedLeafIndex - // Active lanes receive their 19 tap indices in mIndices[k][i]. - // Inactive lanes are zeroed (NanoVDB background index). - // - // Caller pattern: - // stencilAcc.moveTo(leafIndex + bs, voxelOffset + bs); - // for (int i = 0; i < W; ++i) { - // if (leafIndex[bs + i] == UnusedLeafIndex) continue; - // ...stencilAcc.mIndices[k][i]... - // } - // - // See StencilAccessor.md §8 for the full straddling loop design. - // ------------------------------------------------------------------------- - void moveTo(const uint32_t* leafIndex, const uint16_t* voxelOffset) - { - // Zero the whole results buffer — inactive lanes stay 0. - std::memset(mIndices, 0, sizeof(mIndices)); - - // Load the batch into SIMD registers for the SWAR / straddling logic. - const LeafIdVec leafSlice = loadLeafIdVec(leafIndex); - const OffsetVec voVec = loadOffsetVec(voxelOffset); - - // Initial active-lane mask (which lanes have real voxels). - LeafMaskVec activeMask = (leafSlice != LeafIdVec(UnusedLeafIndex)); - - if (util::none_of(activeMask)) return; - -#ifndef NDEBUG - uint32_t nAdvances = 0; -#endif - - // Straddling loop: consume one center leaf's worth of lanes per iteration. - while (util::any_of(activeMask)) { - const LeafMaskVec leafMask = - activeMask & (leafSlice == LeafIdVec(mBatch.centerLeafID())); - - if (util::none_of(leafMask)) { - // No lanes for this leaf — advance to next. - mBatch.advance(mBatch.centerLeafID() + 1); -#ifndef NDEBUG - assert(++nAdvances <= mNExtraLeaves); -#endif - continue; - } - - // Prefetch hull — warms all neighbor-leaf directions the full - // stencil can reach, before any cachedGetValue runs. - prefetchHull(voVec, leafMask, std::make_index_sequence{}); - - // Fill all SIZE tap entries for the lanes in leafMask. - calcTaps(voVec, leafMask, std::make_index_sequence{}); - - // Remove processed lanes. - activeMask = activeMask & !leafMask; - } - } - - // ------------------------------------------------------------------------- - // moveToInLeaf -- benchmarking variant: identical to moveTo except that - // each tap is wrapped to the center leaf via (localVoxel + tap) mod 8. - // - // Purpose: measure the hybrid pipeline's floor cost with 18 distinct - // compile-time taps that all access the SAME leaf, preventing both the - // cross-leaf L1 pressure and the compiler CSE of identical taps. All - // StencilT::Taps offsets must be in [0, 7] per axis. - // - // NOT for production use -- results have no geometric meaning; they - // just exercise the hybrid's code path under a controlled cache regime. 
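To make the caller pattern above concrete, a consumption sketch. Assumptions: W = 8, an external float `sidecar` array indexed by ValueOnIndex, `fetchTap` as an illustrative free function (not part of the class), and the `nanovdb::util` namespace used by this branch's Simd.h:

```cpp
#include <cstdint>

template<class SAccT>
void fetchTap(const SAccT& acc, int tap, const float* sidecar, float out[8])
{
    // mIndices is row-major [SIZE][W]; one tap row is W contiguous uint64_t.
    nanovdb::util::Simd<uint64_t, 8> idx(acc.mIndices[tap],
                                         nanovdb::util::element_aligned);
    for (int lane = 0; lane < 8; ++lane)
        out[lane] = sidecar[idx[lane]];   // inactive lanes read slot 0 (background)
}
```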
- // ------------------------------------------------------------------------- - void moveToInLeaf(const uint32_t* leafIndex, const uint16_t* voxelOffset) - { - std::memset(mIndices, 0, sizeof(mIndices)); - - const LeafIdVec leafSlice = loadLeafIdVec(leafIndex); - const OffsetVec voVec = loadOffsetVec(voxelOffset); - - LeafMaskVec activeMask = (leafSlice != LeafIdVec(UnusedLeafIndex)); - - if (util::none_of(activeMask)) return; - -#ifndef NDEBUG - uint32_t nAdvances = 0; -#endif - - while (util::any_of(activeMask)) { - const LeafMaskVec leafMask = - activeMask & (leafSlice == LeafIdVec(mBatch.centerLeafID())); - - if (util::none_of(leafMask)) { - mBatch.advance(mBatch.centerLeafID() + 1); -#ifndef NDEBUG - assert(++nAdvances <= mNExtraLeaves); -#endif - continue; - } - - // No prefetchHull — all targets are the center leaf by construction. - calcTapsInLeaf(voVec, leafMask, std::make_index_sequence{}); - - activeMask = activeMask & !leafMask; - } - } - - // ------------------------------------------------------------------------- - // tapIndex() -- compile-time tap lookup. - // - // Returns the slot in mIndices that corresponds to a named stencil tap, - // resolved at compile time against StencilT::Taps. A tap that is not in - // the stencil produces a static_assert. - // - // Usage (reorder-safe, zero runtime cost): - // auto& xm3 = stencilAcc.mIndices[SAccT::tapIndex<-3,0,0>()]; - // ------------------------------------------------------------------------- - template - static constexpr int tapIndex() - { - constexpr int I = detail::findIndex( - std::make_index_sequence{}); - static_assert(I >= 0, "StencilAccessor::tapIndex: tap not in stencil"); - return I; - } - - static constexpr int size() { return SIZE; } - -private: - // ------------------------------------------------------------------------- - // Private helpers - // ------------------------------------------------------------------------- - - // Load LeafIdVec from a uint32_t pointer (scalar or SIMD). - static LeafIdVec loadLeafIdVec(const uint32_t* p) - { - if constexpr (W == 1) return *p; - else return LeafIdVec(p, util::element_aligned); - } - - // Load OffsetVec from a uint16_t pointer (scalar or SIMD). - static OffsetVec loadOffsetVec(const uint16_t* p) - { - if constexpr (W == 1) return *p; - else return OffsetVec(p, util::element_aligned); - } - - // Compile-time fold: prefetch all HULL_SIZE hull directions. - template - void prefetchHull(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence) - { - using Hull = typename StencilT::Hull; - (mBatch.template prefetch< - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk - >(voVec, leafMask), ...); - } - - // Compile-time fold: cachedGetValue for all SIZE taps, write directly into mIndices. - // No where-blend: cachedGetValue's scalar tail writes only leafMask-active - // lanes; lanes outside leafMask keep whatever was written by a previous - // straddling-loop iteration (or zero from the initial memset). - template - void calcTaps(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence) - { - using Taps = typename StencilT::Taps; - (mBatch.template cachedGetValue< - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk - >(mIndices[Is], voVec, leafMask), ...); - } - - // Benchmark-only counterpart: forces all taps into the center leaf. 
- template - void calcTapsInLeaf(OffsetVec voVec, LeafMaskVec leafMask, std::index_sequence) - { - using Taps = typename StencilT::Taps; - (mBatch.template cachedGetValueInLeaf< - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk - >(mIndices[Is], voVec, leafMask), ...); - } - - // ------------------------------------------------------------------------- - // Members - // ------------------------------------------------------------------------- - - BatchAcc mBatch; // owns neighbor-leaf cache, mCenterLeafID - -#ifndef NDEBUG - uint32_t mNExtraLeaves; // removable sanity bound on center-leaf advances -#endif -}; - -} // namespace nanovdb diff --git a/nanovdb/nanovdb/util/Util.h b/nanovdb/nanovdb/util/Util.h index 2c61a205f8..4040e9bfd0 100644 --- a/nanovdb/nanovdb/util/Util.h +++ b/nanovdb/nanovdb/util/Util.h @@ -87,15 +87,22 @@ typedef unsigned long long uint64_t; #endif // if defined(__CUDACC__) || defined(__HIP__) -// NANOVDB_RESTRICT: cross-compiler no-alias hint for pointer parameters. -// GCC and Clang (including NVCC host compilation) spell it __restrict__, -// MSVC spells it __restrict. +// NANOVDB_RESTRICT: cross-compiler no-alias hint for pointer parameters #if defined(_MSC_VER) #define NANOVDB_RESTRICT __restrict #else #define NANOVDB_RESTRICT __restrict__ #endif +// NANOVDB_FORCEINLINE: force inlining at the call site +#if defined(_MSC_VER) +#define NANOVDB_FORCEINLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +#define NANOVDB_FORCEINLINE inline __attribute__((always_inline)) +#else +#define NANOVDB_FORCEINLINE inline +#endif + // The following macro will suppress annoying warnings when nvcc // compiles functions that call (host) intrinsics (which is perfectly valid) #if defined(_MSC_VER) && defined(__CUDACC__) @@ -609,41 +616,6 @@ __hostdev__ inline uint32_t findLowestOn(uint64_t v) #endif }// util::findLowestOn(uint64_t) -// -------------------> countTrailingZeros <---------------------------- - -/// @brief Returns the number of trailing zero bits in the specified 32 bit word, -/// i.e. the index of the lowest set bit. -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ inline uint32_t countTrailingZeros(uint32_t v) -{ - NANOVDB_ASSERT(v); -#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) - return __ffs(v) - 1; // one based indexing -#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanForward(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return static_cast(__builtin_ctz(v)); -#else - //NANO_WARNING("Using software implementation for util::countTrailingZeros(uint32_t v)") - static const unsigned char DeBruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; -// disable unary minus on unsigned warning -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(push) -#pragma warning(disable : 4146) -#endif - return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27]; -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(pop) -#endif - -#endif -}// util::countTrailingZeros(uint32_t) - // -------------------> findHighestOn <---------------------------- /// @brief Returns the index of the highest, i.e. 
most significant, on bit in the specified 32 bit word diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h index 43a8267b1d..3e384c7617 100644 --- a/nanovdb/nanovdb/util/WenoStencil.h +++ b/nanovdb/nanovdb/util/WenoStencil.h @@ -63,7 +63,7 @@ namespace detail { // numerical epsilon; kept as a plain float for broadcast-on-demand. // --------------------------------------------------------------------------- template -[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline T +__hostdev__ NANOVDB_FORCEINLINE T WENO5(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5, float scale2 = 1.f) @@ -97,7 +97,7 @@ WENO5(const T& v1, const T& v2, const T& v3, // degenerates this to the same semantics as the if/else. // --------------------------------------------------------------------------- template -[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline T +__hostdev__ NANOVDB_FORCEINLINE T GodunovsNormSqrd(MaskT isOutside, T dP_xm, T dP_xp, T dP_ym, T dP_yp, @@ -172,8 +172,8 @@ class WenoStencil float mDx2{1.f}; // dx² — fed to WENO5's epsilon via scale2 float mInvDx2{1.f}; // 1 / dx² — final normalisation in normSqGrad - NANOVDB_SIMD_HOSTDEV WenoStencil() = default; - NANOVDB_SIMD_HOSTDEV explicit WenoStencil(float dx) + __hostdev__ WenoStencil() = default; + __hostdev__ explicit WenoStencil(float dx) : mDx2(dx * dx), mInvDx2(1.f / (dx * dx)) {} // Compile-time named-tap access: returns the index of tap (DI,DJ,DK) in @@ -199,7 +199,7 @@ class WenoStencil // // Requires absBackground ≥ 0. // ------------------------------------------------------------------ - [[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline void extrapolate(float absBackground); + __hostdev__ NANOVDB_FORCEINLINE void extrapolate(float absBackground); // ------------------------------------------------------------------ // normSqGrad — Godunov's norm-square of the fifth-order WENO upwind @@ -215,7 +215,7 @@ class WenoStencil // normSqGrad after extrapolate is the typical pipeline shape, but the // method itself does not require extrapolate to have been called. // ------------------------------------------------------------------ - [[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline FloatV normSqGrad(float iso = 0.f) const; + __hostdev__ NANOVDB_FORCEINLINE FloatV normSqGrad(float iso = 0.f) const; private: // Compile-time inverse map: (DI,DJ,DK) → slot index in Taps. Returns -1 @@ -265,7 +265,7 @@ class WenoStencil // and W>1 (native SIMD width). // --------------------------------------------------------------------------- template -[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline void +__hostdev__ NANOVDB_FORCEINLINE void WenoStencil::extrapolate(float absBackground) { const FloatV absBg(absBackground); @@ -294,7 +294,7 @@ WenoStencil::extrapolate(float absBackground) // combinator only (free on x86; identity at W=1). 
// --------------------------------------------------------------------------- template -[[gnu::always_inline]] NANOVDB_SIMD_HOSTDEV inline typename WenoStencil::FloatV +__hostdev__ NANOVDB_FORCEINLINE typename WenoStencil::FloatV WenoStencil::normSqGrad(float iso) const { const FloatV* v = values; From 412cd4b04c666147783a90dd644e211b3cbfad8a Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 27 Apr 2026 17:23:35 -0500 Subject: [PATCH 54/60] nanovdb: drop HaloStencilAccessor design doc and cross-references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HaloStencilAccessor was a speculative alternative implementation of the Phase-2 sidecar fetch path that never had code committed. With the hybrid SIMD StencilAccessor stack also gone (commit 79004aa3), the speculative halo design is no longer in scope for this branch. - Delete util/HaloStencilAccessor.md (337 lines). - WenoStencil.md (live): drop "halo-based fetch" example in §2.4 and the §8 cross-reference bullet. - StencilAccessor.md, BatchAccessor.md (already obsolete; full sweep deferred): remove only the Halo-specific bullets so nothing in the tree references HaloStencilAccessor anymore. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/util/BatchAccessor.md | 14 +- nanovdb/nanovdb/util/HaloStencilAccessor.md | 337 -------------------- nanovdb/nanovdb/util/StencilAccessor.md | 7 - nanovdb/nanovdb/util/WenoStencil.md | 12 +- 4 files changed, 5 insertions(+), 365 deletions(-) delete mode 100644 nanovdb/nanovdb/util/HaloStencilAccessor.md diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md index 88f15a04af..6f7fa5df60 100644 --- a/nanovdb/nanovdb/util/BatchAccessor.md +++ b/nanovdb/nanovdb/util/BatchAccessor.md @@ -1232,7 +1232,7 @@ the perf numbers in this section change when it is toggled. `LeafNode::getValue(offset)` in NanoVDB proper.** Rewriting that function (perhaps ~15 lines, preserving semantics for OFF voxels via a branchless arithmetic gate) would give every stencil-gather caller — - Legacy, hybrid, HaloStencilAccessor, any future variant — a 2–3× speedup + Legacy, hybrid, any future variant — a 2–3× speedup on CPU. Proposed form, sketched below, keeps OFF-returns-0 semantics: ```cpp @@ -1252,17 +1252,7 @@ the perf numbers in this section change when it is toggled. unpredictable one. Needs benchmarking to confirm the optimiser doesn't refold it into a conditional jump.) -3. **HaloStencilAccessor's value proposition is validated but smaller than - advertised.** Its core architectural advantage (precomputed uint64 - indices per tap position, so stencil queries are unconditional indexed - loads) naturally eliminates the `isOn` branch. But a branchless - `LeafNode::getValue` would capture most of the same win without needing - the halo-buffer infrastructure. The halo still wins on absolute perf - (zero per-tap work at query time), but the delta over a branchless - leaf lookup is more like ~0.5–1 ns/voxel than the "sub-2 ns/voxel - territory" framed earlier. - -4. **The hybrid `StencilAccessor`'s design rationale needs a small rewrite.** +3. 
**The hybrid `StencilAccessor`'s design rationale needs a small rewrite.** The shipped hybrid design (§8i) is still the right API choice (Simd-free public surface, compiler-portable perf) — but the justification is not "it beats the gather chain's L1 pressure" (there is none); it is "it diff --git a/nanovdb/nanovdb/util/HaloStencilAccessor.md b/nanovdb/nanovdb/util/HaloStencilAccessor.md deleted file mode 100644 index 2a99d4f15c..0000000000 --- a/nanovdb/nanovdb/util/HaloStencilAccessor.md +++ /dev/null @@ -1,337 +0,0 @@ -# HaloStencilAccessor — Design Document - -## §1 Motivation - -`StencilAccessor` (see `StencilAccessor.md`) was measured at **537 cycles/voxel** -for the index-gathering phase alone (W=16, `Simd`, rdtsc on i9-285K). -The root cause is structural, not a tuning issue: - -- 18 taps × `Simd` = 18 YMM registers needed simultaneously for - `mIndices[]`, but x86 has only 16 YMM registers available. -- The compiler spills every tap slot to memory → every `mIndices[I]` write is a - store, every subsequent read is a load. -- Additionally, the gather step (reading float values from the sidecar using the - computed indices) is a second pass of scattered 64-bit indexed loads. - -**HaloStencilAccessor** eliminates both problems by replacing the -index-buffer-then-gather pattern with a local dense halo buffer that is filled -once per center-leaf run, from which all stencil values are extracted via -sequential, gather-free SIMD operations. - ---- - -## §2 Key idea: dense halo buffer - -For a given center leaf `L`, densify the sidecar float data for `L` and its -6 axis-aligned face-neighbor leaves into a contiguous local array: - -``` -float buf[16][16][16] -``` - -The center leaf occupies positions `[R..R+7]` in each dimension (where `R` is -the stencil radius cap, see §3). Any stencil tap at compile-time offset -`(di, dj, dk)` from center voxel `(i, j, k)` is then: - -```cpp -buf[R + i + di][R + j + dj][R + k + dk] -``` - -This is a branch-free, uniform address expression valid for **any** tap with -`|di|, |dj|, |dk| ≤ R`. No tree traversal, no ValueOnIndex arithmetic, no -leaf-pointer lookup occurs inside the stencil extraction loop. - ---- - -## §3 Why R=4 and 16³ - -For a center leaf of 8×8×8 voxels, supporting stencils up to radius R requires -`(8 + 2R)` voxels per dimension: - -| R | buffer side | buffer size | L1 resident? | -|---|-------------|-------------|--------------| -| 1 (box 3³) | 10³ | ~4 KB | yes | -| 3 (WENO5) | 14³ | ~11 KB | yes | -| **4 (cap)** | **16³** | **16 KB** | **yes** | -| 5 | 18³ | ~23 KB | yes | - -`R = 4` is chosen because: - -1. **16 KB fits comfortably in L1** (P-core L1d = 48 KB). The buffer stays - L1-resident throughout the processing of an entire center-leaf run. -2. **16 = 2³ × 2 → trivially simple addressing.** No `bi/ii` split; a flat - `[16][16][16]` array indexed by `R + i + di` (a 4-bit quantity, 0–15). -3. **Covers both WENO5 (R=3) and any future stencil up to R=4** without - redesign. -4. Powers-of-two strides (256, 16, 1) admit bit-shift addressing. - -**Axis-aligned stencils never access corner or edge neighbor slots.** For -WENO5, taps move in exactly one axis → only the 6 face-neighbor leaf regions -of the 16³ buffer are ever read. Corner/edge slots may be left zero-initialized -(background value) and are never consumed. 
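For concreteness, the §2 addressing rule under the §3 cap, as a sketch (R = 4 and `buf` as described; `tapValue` is an illustrative name):

```cpp
// Every tap with |di|,|dj|,|dk| <= R is one unconditional indexed load:
// no tree traversal, no leaf-pointer lookup, no branch.
constexpr int R = 4;
inline float tapValue(const float (&buf)[16][16][16],   // 16 = 8 + 2*R
                      int i, int j, int k,              // center-leaf voxel, 0..7 each
                      int di, int dj, int dk)           // tap offset, -R..R each
{
    return buf[R + i + di][R + j + dj][R + k + dk];
}
```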
-
-### Buffer population
-
-Slots that are populated from the sidecar:
-
-| Region | Voxels | Source |
-|--------|--------|--------|
-| Center leaf `[R..R+7]³` | 8³ = 512 | sidecar[center leaf] |
-| −x face slab `[0..R-1][R..R+7][R..R+7]` | R×8×8 = 256 | sidecar[−x neighbor] |
-| +x face slab `[R+8..15][R..R+7][R..R+7]` | 256 | sidecar[+x neighbor] |
-| −y, +y, −z, +z face slabs | 256 each | sidecar[respective neighbors] |
-
-**Total sidecar reads per center-leaf run: 512 + 6×256 = 2,048 floats = 8 KB.**
-Corner and edge slots are zero-initialized once at buffer allocation.
-
----
-
-## §4 Run-based outer loop
-
-Within a VBM block (128 voxels), the `leafIndex[]` array produced by
-`decodeInverseMaps` is sorted (the VBM is built in leaf-traversal order).
-Voxels belonging to the same center leaf therefore form **contiguous runs**.
-
-A narrow-band block of 128 voxels sitting inside a single 8³ leaf spans at
-most 1–3 distinct center leaves per block in practice.
-
-```
-for each VBM block:
-    scan leafIndex[0..127] for run boundaries
-    for each run (center leaf L, voxels i_start..i_end):
-        populate buf[16][16][16] from sidecar   // 8 KB, amortized over run
-        process all voxels in [i_start..i_end] via the slice pipeline (§6)
-```
-
-The buffer fill and the 7 neighbor-leaf pointer lookups are amortized across
-all voxels in the run.
-
----
-
-## §5 Stencil extraction: the fill → transpose → compact → transpose pipeline
-
-### §5a Why array-of-stencils [32][512] is easy to fill from buf[]
-
-For tap `t = (di, dj, dk)`, the 512 values `stencil[t][v]` for all center-leaf
-voxel positions `v = i*64 + j*8 + k` are produced by iterating over the 64
-z-rows (fixed `i`, `j`; `k` = 0..7):
-
-```
-for each z-row (i, j):
-    source = &buf[R+i+di][R+j+dj][R+dk]   // 8 consecutive floats in buf
-    dest   = &stencil[t][i*64 + j*8]      // 8 consecutive floats in row t
-    store 8 floats (one YMM)
-```
-
-Both source (L1-resident `buf`) and destination (stencil row `t`) are accessed
-sequentially — **zero gathers**. All 18 WENO5 taps fill sequentially; the 14
-padding slots (see §5b) are either zero-initialized once or left unused.
-
-### §5b Padding taps to 32
-
-WENO5 has 18 taps. Padding to 32 (next power of 2) gives:
-
-- Each voxel row in `stencil[*][32]` is exactly **128 bytes = 2 cache lines**,
-  naturally aligned.
-- Compaction of one row (§5d) is exactly **4 full YMM loads + 4 full YMM
-  stores** — no masking, no partial registers, no scalar tail.
-- `stencil[v]` address = `base + (v << 7)`: a single shift, no multiply.
-- 14 padding slots are never read during WENO5 arithmetic and cost nothing.
-
-### §5c Layout duality and the 8×8 in-register transpose
-
-Two layouts are needed at different pipeline stages:
-
-| Layout | Alias | Size | Best for |
-|--------|-------|------|----------|
-| `float[32][N]` | SoA | 18/32 contiguous values per tap | sequential fill from buf |
-| `float[N][32]` | AoS | 32 contiguous values per voxel | WENO5 arithmetic, compaction |
-
-Converting between them: the standard **8×8 SIMD float block transpose**.
-
-Given 8 YMM registers (one row each = 8 floats), the 8×8 transpose applies a
-fixed 24-instruction register-only shuffle network
-(`vunpcklps`/`vunpckhps` → `vunpcklpd`/`vunpckhpd` → `vperm2f128`)
-and stores 8 result YMM registers. All shuffles are register-to-register;
-no intermediate memory is touched between the 8 loads and 8 stores.
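The 24-instruction network in §5c is a standard kernel; below is one possible AVX2 spelling (illustrative, not code from this branch), using `_mm256_shuffle_ps` for the middle stage, which is equivalent via casts to the pd-unpack formulation named above:

```cpp
#include <immintrin.h>

// 8x8 float transpose: 8 unpacks, 8 four-lane shuffles, 8 cross-lane
// permutes -- 24 register-only instructions between the loads and stores.
inline void transpose8x8(__m256 r[8])
{
    __m256 t0 = _mm256_unpacklo_ps(r[0], r[1]), t1 = _mm256_unpackhi_ps(r[0], r[1]);
    __m256 t2 = _mm256_unpacklo_ps(r[2], r[3]), t3 = _mm256_unpackhi_ps(r[2], r[3]);
    __m256 t4 = _mm256_unpacklo_ps(r[4], r[5]), t5 = _mm256_unpackhi_ps(r[4], r[5]);
    __m256 t6 = _mm256_unpacklo_ps(r[6], r[7]), t7 = _mm256_unpackhi_ps(r[6], r[7]);
    __m256 u0 = _mm256_shuffle_ps(t0, t2, 0x44), u1 = _mm256_shuffle_ps(t0, t2, 0xEE);
    __m256 u2 = _mm256_shuffle_ps(t1, t3, 0x44), u3 = _mm256_shuffle_ps(t1, t3, 0xEE);
    __m256 u4 = _mm256_shuffle_ps(t4, t6, 0x44), u5 = _mm256_shuffle_ps(t4, t6, 0xEE);
    __m256 u6 = _mm256_shuffle_ps(t5, t7, 0x44), u7 = _mm256_shuffle_ps(t5, t7, 0xEE);
    r[0] = _mm256_permute2f128_ps(u0, u4, 0x20);  // output rows = input columns 0..3
    r[1] = _mm256_permute2f128_ps(u1, u5, 0x20);
    r[2] = _mm256_permute2f128_ps(u2, u6, 0x20);
    r[3] = _mm256_permute2f128_ps(u3, u7, 0x20);
    r[4] = _mm256_permute2f128_ps(u0, u4, 0x31);  // ... and columns 4..7
    r[5] = _mm256_permute2f128_ps(u1, u5, 0x31);
    r[6] = _mm256_permute2f128_ps(u2, u6, 0x31);
    r[7] = _mm256_permute2f128_ps(u3, u7, 0x31);
}
```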
-
-For an M×N matrix (both M and N multiples of 8):
-- Number of 8×8 block transposes: (M/8) × (N/8)
-- Uses exactly 16 YMM registers (8 input + 8 output) — exact fit
-
-### §5d Compaction: AoS[512][32] → AoS[N_active][32]
-
-Given `stencil[512][32]` (AoS, all leaf positions) and the `voxelOffset[]`
-list of N_active active voxels (0–511, from `decodeInverseMaps`):
-
-```
-for v in 0..N_active:
-    ymm0..3 = load stencil[voxelOffset[v]][0..31]   // 4 × YMM load
-    store compact[v][0..31]                         // 4 × YMM store
-```
-
-Total: N_active × 8 YMM operations. Rows are 128-byte aligned; no masking.
-
-### §5e Slicing: keep the working set in L1
-
-`stencil[32][512]` = 64 KB — L2 resident. Instead, process 4 slices of 128
-voxels each:
-
-```
-stencil[32][128] = 32 × 128 × 4 = 16 KB   ← L1 resident
-```
-
-With the 16³ `buf` (16 KB) and the stencil slice (16 KB), the total working
-set is **32 KB**, well within L1 (48 KB). All fill and transpose phases stay
-in L1 — no L2 traffic during data transformation.
-
-Slice boundaries: voxel positions 0–127, 128–255, 256–383, 384–511 within the
-center leaf. Each slice is processed identically; active-voxel results are
-accumulated across slices.
-
----
-
-## §6 Per-slice pipeline (full detail)
-
-For each of the 4 slices `[s*128 .. (s+1)*128 - 1]`:
-
-```
-Step 1   Fill stencil[32][128]
-         ── For each tap t in 0..17:
-              For each z-row (i,j) with voxels in this slice:
-                load 8 floats from buf (L1) → store to stencil[t][row] (L1)
-         ── Cost: 9 KB reads + 9 KB writes, all L1.
-
-Step 2   Transpose [32][128] → [128][32]
-         ── 64 × 8×8 in-register block transposes.
-         ── Cost: 16 KB L1 read + 16 KB L1 write; all shuffles register-only.
-
-Step 3   Compact [128][32] → [N_slice][32]
-         ── For each active voxel in this slice: 4 YMM loads + 4 YMM stores (L1).
-         ── N_slice ≤ 128. Cost: ≤ 16 KB L1 read + ≤ 16 KB L1 write.
-
-Step 4   Transpose [N_slice][32] → [32][N_slice]
-         ── ≤ 64 × 8×8 in-register block transposes.
-         ── Cost: ≤ 16 KB L1 read + write.
-
-Step 5   WENO5 arithmetic on stencil[32][N_slice]
-         ── For each of the 18 taps: sequential load of N_slice floats (L1).
-         ── ~700 FLOPs/voxel, vectorised over N_slice voxels in YMM batches.
-         ── Cost: ~700 × N_slice / 32 cycles (2 FMA units × 8 floats).
-
-Step 6   Write output
-         ── N_slice scalar (or YMM-masked) stores to output sidecar.
-```
-
----
-
-## §7 Performance analysis
-
-### Measured baseline (StencilAccessor, W=16)
-
-- **8,586 TSC ticks/batch** (16 voxels) → **537 cycles/voxel**
-- Gather phase only; WENO5 arithmetic not yet included.
-- Root cause: register spilling of 18 × `Simd`.
-
-### CPU parameters (i9-285K, TSC reference clock)
-
-| Resource | Throughput |
-|----------|-----------|
-| L1 read | 64 bytes/cycle (~237 GB/s at 3.7 GHz) |
-| L1 write | 32 bytes/cycle (~118 GB/s) |
-| L2 read | 32 bytes/cycle (~118 GB/s) |
-| FMA peak | 32 FLOPs/cycle (2 units × 8 floats × 2 FLOPs) |
-
-### Estimated cost per slice (128 voxels, ~32 active)
-
-| Step | Data touched | Estimated cycles |
-|------|-------------|-----------------|
-| 1 — fill stencil[32][128] | 9 KB L1 write | ~288 |
-| 2 — transpose [32][128]→[128][32] | 16 KB L1 r+w | ~512 |
-| 3 — compact | 16 KB L1 r+w | ~512 |
-| 4 — transpose compact output | ≤16 KB L1 r+w | ~256 |
-| 5 — WENO5 arithmetic (32 active) | 18×32×4=2 KB L1 read | ~700 |
-| **Total per slice** | | **~2,268** |
-
-Per voxel (32 active): ~71 cycles/voxel — including full WENO5 arithmetic.
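As a concrete sketch of the §5d inner loop (hypothetical names; unaligned intrinsics are used for safety, although §5d's 128-byte row alignment would permit the aligned forms):

```cpp
#include <immintrin.h>
#include <cstdint>

// Copy one 32-float (128 B) stencil row per active voxel: 4 YMM pairs each.
void compactRows(const float (*stencil)[32],      // [512][32], all leaf positions
                 const uint16_t* voxelOffset,     // sorted active voxel ids, 0..511
                 int nActive,
                 float (*compact)[32])            // [nActive][32] output
{
    for (int v = 0; v < nActive; ++v) {
        const float* src = stencil[voxelOffset[v]];
        float*       dst = compact[v];
        for (int q = 0; q < 4; ++q)               // 4 × 8 floats = 32 tap slots
            _mm256_storeu_ps(dst + 8 * q, _mm256_loadu_ps(src + 8 * q));
    }
}
```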
-
-Four slices + buffer fill (~300 cycles amortised): ~9,372 cycles per 128-voxel
-block → ~**73 cycles/voxel total**.
-
-### Comparison
-
-| Metric | StencilAccessor | HaloStencilAccessor (est.) |
-|--------|-----------------|---------------------------|
-| Gather phase only | 537 cycles/voxel | ~50 cycles/voxel |
-| Gather + WENO5 | not measured | ~73 cycles/voxel |
-| Dominant bottleneck | register spilling | L1 write bandwidth |
-| Gathers in hot loop | yes (scattered 64-bit) | **none** |
-
-**Estimated speedup over StencilAccessor: 7–10× on the gather phase;
-gather + WENO5 combined comes in under the cost of gathering alone in the
-old design.**
-
----
-
-## §8 Future optimisations
-
-### §8a Fuse fill + first transpose (steps 1+2)
-
-Fill 8 tap rows × 8 voxels into YMM registers, immediately transpose the
-8×8 block and store in AoS order. Eliminates one full pass over the 16 KB
-stencil slice; saves ~512 cycles per slice.
-
-### §8b Fuse WENO5 arithmetic with step 4
-
-Rather than materialising `stencil[32][N_slice]` (SoA), compute WENO5
-directly from `stencil[N_slice][32]` (AoS) by loading each voxel's 32-float
-row and computing vertically. Eliminates step 4 entirely. Effective when
-N_slice is small (dense run ≤ 32 voxels).
-
-### §8c Software pipelining across slices
-
-While WENO5 runs on slice `s`, fill the stencil buffer for slice `s+1`.
-The two phases touch disjoint L1 regions; overlap is feasible.
-
-### §8d TBB parallel_for over blocks
-
-VBM blocks are independent (grid is read-only). Each thread owns its
-block's 32 KB working set (buf + slice); no synchronisation required.
-Expected 7–8× speedup across 8 P-cores.
-
----
-
-## §9 Design decisions summary
-
-| Decision | Choice | Rationale |
-|----------|--------|-----------|
-| Stencil radius cap | R = 4 | 16³ = 16 KB, L1 resident; power-of-2 |
-| Tap count padding | 18 → 32 | YMM-aligned compaction (4 full registers) |
-| Dense or sparse fill | Dense (all 512 leaf positions) | Branchless; cheaper than compaction logic during fill |
-| Slice size | 128 voxels (4 slices of 512) | buf(16 KB) + slice(16 KB) = 32 KB ≤ L1 |
-| Transpose kernel | 8×8 in-register float block | 16 YMM registers, no memory between load/store |
-| Compaction order | After first transpose | Driven by sorted voxelOffset[] from decodeInverseMaps |
-| Outer loop | Run-based (by center leaf) | Amortises buffer fill over entire run |
-
----
-
-## §10 Open questions
-
-1. **Leaf-pointer resolution**: buffer fill still requires resolving 6 face-neighbor
-   leaf pointers via tree traversal. Should this reuse BatchAccessor's
-   neighbor-lookup machinery, or be a standalone 6-pointer lookup?
-
-2. **Missing neighbors**: if a face-neighbor leaf does not exist in the grid,
-   the corresponding slab should be zero-filled (background = 0 for
-   ValueOnIndex grids). Confirm zero-init strategy for absent neighbors.
-
-3. **Non-uniform active-voxel density**: some slices may have 0 active voxels
-   (entire slice inactive). Add a slice-skip predicate?
-
-4. **Output sidecar write-back**: the `voxelOffset` of each active voxel gives
-   its ValueOnIndex; use that to write the WENO5 result directly to the output
-   sidecar. Confirm index arithmetic.
-
-5. **Tap padding slots (18..31)**: never read in WENO5 arithmetic. Can be
-   left uninitialised (no UB since never read) or zero-filled once. Decide
-   at implementation time.
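A minimal sketch of the §8d parallelisation, assuming oneTBB; `Scratch`, `processAllBlocks`, and the elided pipeline call are hypothetical names, not code from this branch:

```cpp
#include <tbb/parallel_for.h>
#include <tbb/enumerable_thread_specific.h>

struct Scratch {                           // per-thread 32 KB working set
    alignas(64) float buf[16][16][16];     // dense halo (§2), 16 KB
    alignas(64) float slice[32][128];      // stencil slice (§5e), 16 KB
};

void processAllBlocks(int numBlocks /*, grid, sidecars, ... */)
{
    tbb::enumerable_thread_specific<Scratch> scratch;
    tbb::parallel_for(0, numBlocks, [&](int blockID) {
        Scratch& s = scratch.local();      // blocks are independent: no locks
        (void)s; (void)blockID;            // fill s.buf, run the §6 pipeline, write out
    });
}
```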
diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md index c228644314..8d9b636f04 100644 --- a/nanovdb/nanovdb/util/StencilAccessor.md +++ b/nanovdb/nanovdb/util/StencilAccessor.md @@ -528,13 +528,6 @@ because both pay the same dominant `isOn` mispredict cost. `NANOVDB_USE_BRANCHY_GETVALUE` to restore the old branchy form. End-to-end 1.4× on realistic narrow-band workloads, 2.8× on random-access. -- **HaloStencilAccessor's value proposition is validated but narrower**: - its precomputed uint64 index buffer naturally eliminates `isOn` - branches by never evaluating them. Now that the branchless - `getValue` captures the same win cheaply, the halo's remaining - advantage is "zero per-tap work at query time" rather than "avoids - the isOn mispredict storm." Worth building for the absolute-perf - cases; less urgent than previously framed. See `BatchAccessor.md` §8j for the original measurement matrix and correction log (§8g/§8h/§8i), and `BatchAccessor.md` §8k for the diff --git a/nanovdb/nanovdb/util/WenoStencil.md b/nanovdb/nanovdb/util/WenoStencil.md index 590b933d2c..a68c88a335 100644 --- a/nanovdb/nanovdb/util/WenoStencil.md +++ b/nanovdb/nanovdb/util/WenoStencil.md @@ -116,10 +116,9 @@ For the CPU SIMD case that's typically a pair of stack-local no intermediate buffer is needed at all. This preserves the arithmetic class's purity and gives callers flex- -ibility — a different Phase-2 path (e.g. a halo-based fetch, or a -future hardware-gather fill) can populate the stencil using whatever -pattern fits, without the class having to expose a "fill API" that -bakes in one shape. +ibility — a different Phase-2 path (e.g. a future hardware-gather +fill) can populate the stencil using whatever pattern fits, without +the class having to expose a "fill API" that bakes in one shape. --- @@ -457,11 +456,6 @@ exists. gather). `StencilAccessor` fills `mIndices[SIZE][W]`; callers consume those indices (via `sidecar[idx]` in their fill loops) and populate `WenoStencil::values[]` / `isActive[]`. -- **`HaloStencilAccessor.md`** — speculative alternative that - precomputes a dense float halo buffer; if that path is pursued, - `WenoStencil` would fill from the halo instead of from sidecar - indices. The extrapolation and normSqGrad algorithms here transfer - unchanged. - **`nanovdb/math/Stencils.h`** — the scalar ground-truth for WENO5 and Godunov. `WenoStencil::normSqGrad()` is a line-for-line transliteration of `nanovdb::math::WenoStencil::normSqGrad()` From 4565d482b73cca93a7b1d0daff053cdb4f879d29 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Mon, 27 Apr 2026 17:23:53 -0500 Subject: [PATCH 55/60] nanovdb/util/Simd.h: remove unused gather and popcount The hybrid StencilAccessor/BatchAccessor stack was the only consumer of util::gather, util::gather_if, util::popcount, and util::popcount64; all four became dead code when that stack was removed (commit 79004aa3). Verified no live caller in the tree (the disabled examples didn't use them either) and that ex_weno_nanovdb_cpu's WenoStencil<16> path is syntax-clean against the trimmed header. - Drop the three gather overloads (unmasked / masked / merge-masked) from both backends and the scalar fallback path. - Drop popcount64 plus the lane-wise popcount and scalar popcount. - Update the simd_cast doc comment to drop the "when building gather indices" justification (widening between integer types remains). Net: -104 lines (562 -> 458). 
Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/util/Simd.h | 114 ++----------------------------------
 1 file changed, 5 insertions(+), 109 deletions(-)

diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h
index c73c15f8be..00bebd3a29 100644
--- a/nanovdb/nanovdb/util/Simd.h
+++ b/nanovdb/nanovdb/util/Simd.h
@@ -134,39 +134,6 @@ inline void store(Simd<T, W> v, T* p, element_aligned_tag = {}) {
     v.copy_to(p, element_aligned);
 }
 
-// Unmasked gather: result[i] = ptr[idx[i]] for all lanes.
-// IdxT may be int32_t or int64_t; the compiler selects the matching hardware
-// instruction (vpgatherdps/vpgatherdq for 32-bit idx, vpgatherqq for 64-bit idx).
-template<typename T, int W, typename IdxT>
-inline Simd<T, W> gather(const T* __restrict__ ptr, Simd<IdxT, W> idx) {
-    return Simd<T, W>([&](int i) { return ptr[idx[i]]; });
-}
-
-// Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback.
-// Implemented as a full gather + where-blend; ptr is accessed for ALL lanes,
-// so every idx[i] must be a valid offset regardless of mask[i].
-template<typename T, int W, typename IdxT>
-inline Simd<T, W> gather(SimdMask<T, W> mask, const T* __restrict__ ptr,
-                         Simd<IdxT, W> idx, T fallback = T(0)) {
-    auto result = Simd<T, W>(fallback);
-    stdx::where(mask, result) = Simd<T, W>([&](int i) { return ptr[idx[i]]; });
-    return result;
-}
-
-// Merge-masked gather: dst[i] = mask[i] ? ptr[idx[i]] : dst[i] (unchanged).
-// MaskElemT may differ from T (heterogeneous mask, e.g. SimdMask<uint64_t, W>
-// applied to Simd<float, W> data). When T==MaskElemT, delegates directly
-// to stdx::where; otherwise uses the WhereExpression boolean round-trip.
-template<typename T, int W, typename MaskElemT, typename IdxT>
-inline void gather_if(Simd<T, W>& dst, SimdMask<MaskElemT, W> mask,
-                      const T* __restrict__ ptr, Simd<IdxT, W> idx) {
-    if constexpr (std::is_same_v<T, MaskElemT>) {
-        stdx::where(mask, dst) = Simd<T, W>([&](int i) { return ptr[idx[i]]; });
-    } else {
-        where(mask, dst) = Simd<T, W>([&](int i) { return ptr[idx[i]]; });
-    }
-}
-
 // ===========================================================================
 // Implementation B: std::array backend (default)
 // ===========================================================================
@@ -346,44 +313,16 @@ __hostdev__ void store(Simd<T, W> v, T* p, element_aligned_tag = {}) {
     v.store(p);
 }
 
-// Unmasked gather: result[i] = ptr[idx[i]] for all lanes.
-template<typename T, int W, typename IdxT>
-__hostdev__ Simd<T, W> gather(const T* __restrict__ ptr, Simd<IdxT, W> idx) {
-    Simd<T, W> r;
-    for (int i = 0; i < W; i++) r[i] = ptr[idx[i]];
-    return r;
-}
-
-// Masked gather: result[i] = mask[i] ? ptr[idx[i]] : fallback.
-// Scalar path: accesses ptr only for true lanes (ternary short-circuits).
-template<typename T, int W, typename IdxT>
-__hostdev__ Simd<T, W> gather(SimdMask<T, W> mask, const T* __restrict__ ptr,
-                              Simd<IdxT, W> idx, T fallback = T(0)) {
-    Simd<T, W> r;
-    for (int i = 0; i < W; i++) r[i] = mask[i] ? ptr[idx[i]] : fallback;
-    return r;
-}
-
-// Merge-masked gather: dst[i] = mask[i] ? ptr[idx[i]] : dst[i] (unchanged).
-// MaskElemT may differ from T (heterogeneous mask).
-// Scalar path: only accesses ptr for true lanes.
-template<typename T, int W, typename MaskElemT, typename IdxT>
-__hostdev__ void gather_if(Simd<T, W>& dst, SimdMask<MaskElemT, W> mask,
-                           const T* __restrict__ ptr, Simd<IdxT, W> idx) {
-    for (int i = 0; i < W; i++)
-        if (mask[i]) dst[i] = ptr[idx[i]];
-}
-
 #endif // NANOVDB_USE_STD_SIMD
 
 // ---------------------------------------------------------------------------
 // simd_cast — element-wise static_cast between Simd types of the same W.
 //
-// Used for widening (uint16_t → uint32_t, uint32_t → uint64_t) and for
-// reinterpreting signedness (uint32_t → int32_t) when building gather indices.
-// Both backends: the array backend uses a lane loop; the stdx backend uses the
-// generator constructor, which the compiler lowers to a vpmovsxbw / vpmovzxwd
-// sequence or similar sign/zero-extend instruction depending on the types.
+// Used for widening between integer element types (uint16_t → uint32_t,
+// uint32_t → uint64_t). Both backends: the array backend uses a lane loop;
+// the stdx backend uses the generator constructor, which the compiler lowers
+// to a vpmovsxbw / vpmovzxwd sequence or similar sign/zero-extend instruction
+// depending on the types.
 // Scalar overload: degrades to static_cast for plain scalar types.
 // ---------------------------------------------------------------------------
 template<typename DstT, typename SrcT, int W>
@@ -420,39 +359,6 @@ __hostdev__ void simd_cast_if(DstT& dst, bool mask, SrcT src) {
     if (mask) dst = static_cast<DstT>(src);
 }
 
-// ---------------------------------------------------------------------------
-// popcount64 — scalar SWAR popcount, always uses arithmetic (no __builtin_popcountll).
-//
-// Safe to call per-lane inside a vectorizable loop: every operation (>>, &, +, -)
-// maps to an AVX2 instruction for 64-bit elements (vpsrlq, vpand, vpaddq, vpsubq).
-// The final byte-sum uses a shift-and-add tree instead of the multiply trick
-// (v * 0x0101...) since 64x64->64 multiply has no AVX2 equivalent (vpmullq is AVX-512).
-// ---------------------------------------------------------------------------
-__hostdev__ inline uint64_t popcount64(uint64_t v)
-{
-    v -= (v >> 1) & uint64_t(0x5555555555555555);
-    v  = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333));
-    v  = (v + (v >> 4)) & uint64_t(0x0F0F0F0F0F0F0F0F); // per-byte counts
-    v += v >> 8;  v &= uint64_t(0x00FF00FF00FF00FF);
-    v += v >> 16; v &= uint64_t(0x0000FFFF0000FFFF);
-    v += v >> 32;
-    return v & uint64_t(63);
-}
-
-// Lane-wise SIMD popcount: applies popcount64 to every lane.
-// Backend A: generator constructor; Backend B: element loop (auto-vectorized by GCC/Clang).
-template<int W>
-__hostdev__ Simd<uint64_t, W> popcount(Simd<uint64_t, W> v) {
-#ifdef NANOVDB_USE_STD_SIMD
-    return Simd<uint64_t, W>([&](int i) { return popcount64(v[i]); });
-#else
-    Simd<uint64_t, W> r;
-    for (int i = 0; i < W; ++i) r[i] = popcount64(v[i]);
-    return r;
-#endif
-}
-__hostdev__ inline uint64_t popcount(uint64_t v) { return popcount64(v); }
-
 // ---------------------------------------------------------------------------
 // simd_traits — generic per-lane access for scalar and Simd types.
 //
@@ -548,15 +454,5 @@ __hostdev__ ScalarWhereProxy<T> where(bool mask, T& target) {
     return {mask, target};
 }
 
-// Unmasked scalar gather: result = ptr[idx].
-template<typename T, typename IdxT>
-__hostdev__ T gather(const T* __restrict__ ptr, IdxT idx) { return ptr[idx]; }
-
-// Merge-masked scalar gather: dst = ptr[idx] only if mask, else dst unchanged.
-template<typename T, typename IdxT>
-__hostdev__ void gather_if(T& dst, bool mask, const T* __restrict__ ptr, IdxT idx) {
-    if (mask) dst = ptr[idx];
-}
-
 } // namespace util
 } // namespace nanovdb

From bc0c853d9738076fd44499a96f5f84a4b72c0684 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Tue, 28 Apr 2026 00:54:06 -0500
Subject: [PATCH 56/60] nanovdb/util/Simd.h: cleanup pass

Trim dead code, add value_type/mask_type/simd_type/size(), copy_from/
copy_to, compound assigns, full comparison set. Wrap types in nested
nanovdb::util::experimental namespace. Move Min/Max/Select to
nanovdb::math (scalar Select added in math/Math.h). Rename
NANOVDB_NO_STD_SIMD -> NANOVDB_SIMD_ARRAY_BACKEND, NANOVDB_USE_STD_SIMD
-> NANOVDB_USE_STDX_SIMD. Add doxygen header. Tighten comments.
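Call sites now spell out the nested namespace and the math helpers,
e.g. (illustrative snippet only; width 8 and the clampedAbs helper are
made up for this message, not part of the diff):

    using FloatV = nanovdb::util::experimental::Simd<float, 8>;
    using MaskV  = nanovdb::util::experimental::SimdMask<float, 8>;

    FloatV clampedAbs(FloatV v, FloatV lo, FloatV hi) {
        const MaskV neg = FloatV(0.f) > v;        // lane-wise compare
        v = nanovdb::math::Select(neg, -v, v);    // |v| via blend
        return nanovdb::math::Min(nanovdb::math::Max(v, lo), hi);
    }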
WenoStencil and ex_weno_nanovdb_cpu updated to track the new namespace and Min/Max/Select location. Bit-exact on the taperLER smoke test. Signed-off-by: Efty Sifakis --- .../ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp | 10 +- nanovdb/nanovdb/math/Math.h | 5 + nanovdb/nanovdb/util/Simd.h | 420 +++++------------- nanovdb/nanovdb/util/WenoStencil.h | 30 +- 4 files changed, 147 insertions(+), 318 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp index 15f1b83800..8def5c3dc2 100644 --- a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp @@ -276,8 +276,8 @@ runFast(const IndexGridT& index nanovdb::WenoStencil stencil(dx); constexpr int SIZE = nanovdb::WenoStencil::size(); - using FloatV = nanovdb::util::Simd ; - using MaskV = nanovdb::util::SimdMask; + using FloatV = nanovdb::util::experimental::Simd ; + using MaskV = nanovdb::util::experimental::SimdMask; // One LegacyStencilAccessor per TBB task (one ReadAccessor). LegacyAccT legacyAcc(indexGrid); @@ -328,8 +328,8 @@ runFast(const IndexGridT& index // -------- Load: per-tap SIMD load into stencil view -------- for (int k = 0; k < SIZE; ++k) { - stencil.values [k] = FloatV(raw_values[k], nanovdb::util::element_aligned); - stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::element_aligned); + stencil.values [k] = FloatV(raw_values[k], nanovdb::util::experimental::element_aligned); + stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::experimental::element_aligned); } // -------- Phase-3 arithmetic (in-place on Simd values) -------- @@ -338,7 +338,7 @@ runFast(const IndexGridT& index // -------- Simd -> scalar bridge + per-lane store -------- alignas(64) float result_lanes[SIMDw]; - nanovdb::util::store(result, result_lanes, nanovdb::util::element_aligned); + result.copy_to(result_lanes, nanovdb::util::experimental::element_aligned); for (int i = 0; i < SIMDw; ++i) { const int p = batchStart + i; if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; diff --git a/nanovdb/nanovdb/math/Math.h b/nanovdb/nanovdb/math/Math.h index 19b1beee81..a3ee0df993 100644 --- a/nanovdb/nanovdb/math/Math.h +++ b/nanovdb/nanovdb/math/Math.h @@ -172,6 +172,11 @@ __hostdev__ inline double Max(double a, double b) { return fmax(a, b); } +template +__hostdev__ inline T Select(bool m, T a, T b) +{ + return m ? a : b; +} __hostdev__ inline float Clamp(float x, float a, float b) { return Max(Min(x, b), a); diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h index 00bebd3a29..666acab0e4 100644 --- a/nanovdb/nanovdb/util/Simd.h +++ b/nanovdb/nanovdb/util/Simd.h @@ -1,48 +1,60 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file nanovdb/util/Simd.h + + \author Efty Sifakis + + \date April 28, 2026 + + \brief Minimal SIMD abstraction for NanoVDB stencil kernels. +*/ + #pragma once #include #include #include // __hostdev__ -// Minimal SIMD abstraction for NanoVDB stencil kernels. -// // Two implementations, selected automatically at compile time: // -// NANOVDB_USE_STD_SIMD (set when is available): -// Simd and SimdMask are pure type aliases for +// stdx backend (default, when is available): +// Simd and SimdMask are aliases for // std::experimental::fixed_size_simd / fixed_size_simd_mask. 
-// All arithmetic delegates to the standard type; the compiler emits -// native vector instructions without relying on the auto-vectorizer. +// Internal flag: NANOVDB_USE_STDX_SIMD. // -// Default (std::array backend): +// std::array backend (forced via NANOVDB_SIMD_ARRAY_BACKEND, or when +// is unavailable): // Simd wraps std::array with element-wise operator loops. -// Clang auto-vectorizes these loops; GCC does not. // -// In both cases the interface is identical, so templated kernels (T=float -// for GPU, T=Simd for CPU) compile unmodified. -// -// Mirrors the C++26 std::simd naming — migration will be a typedef swap. +// The interface is identical in both cases, so templated kernels +// (T=float for GPU, T=Simd for CPU) compile unmodified. // --------------------------------------------------------------------------- // Auto-detect std::experimental::simd (Parallelism TS v2) // --------------------------------------------------------------------------- -#if !defined(NANOVDB_NO_STD_SIMD) && defined(__has_include) && __has_include() +#if !defined(NANOVDB_SIMD_ARRAY_BACKEND) && defined(__has_include) && __has_include() # include # ifdef __cpp_lib_experimental_parallel_simd -# define NANOVDB_USE_STD_SIMD 1 +# define NANOVDB_USE_STDX_SIMD 1 # endif #endif namespace nanovdb { namespace util { -// --------------------------------------------------------------------------- -// element_aligned_tag — portable load/store alignment descriptor. -// In the stdx backend this is an alias for stdx::element_aligned_tag so that -// nanovdb::util::element_aligned is the same token stdx constructors expect. -// In the std::array backend it is a standalone dummy struct (ignored). -// --------------------------------------------------------------------------- -#ifdef NANOVDB_USE_STD_SIMD +// =========================================================================== +// nanovdb::util::experimental — internal SIMD primitives. Names in this +// nested namespace are unstable API by convention; external callers should +// not depend on them. +// =========================================================================== +namespace experimental { + +// element_aligned_tag — load/store alignment descriptor. Aliases +// stdx::element_aligned_tag in the stdx backend; empty struct in the +// array backend. +#ifdef NANOVDB_USE_STDX_SIMD namespace stdx = std::experimental; using element_aligned_tag = stdx::element_aligned_tag; #else @@ -53,7 +65,7 @@ inline constexpr element_aligned_tag element_aligned{}; // =========================================================================== // Implementation A: std::experimental::simd — pure type aliases // =========================================================================== -#ifdef NANOVDB_USE_STD_SIMD +#ifdef NANOVDB_USE_STDX_SIMD template using SimdMask = stdx::fixed_size_simd_mask; @@ -61,100 +73,33 @@ using SimdMask = stdx::fixed_size_simd_mask; template using Simd = stdx::fixed_size_simd; -// NANOVDB_FORCEINLINE (see Util.h) forces these thin wrappers to inline -// at every call site. Without it, GCC's cost model sometimes outlines -// them — each call then pays a function-call + vzeroupper + register- -// ABI transition that dominates the one-instruction body -// (vminps / vmaxps / vblendvps). See BatchAccessor.md §8h for the -// analogous fix on the StencilAccessor path. 
-template -NANOVDB_FORCEINLINE Simd min(Simd a, Simd b) { return stdx::min(a, b); } - -template -NANOVDB_FORCEINLINE Simd max(Simd a, Simd b) { return stdx::max(a, b); } - -// TS v2 where(mask, v) is a masked assignment proxy, not a 3-arg select. -// Wrap it into the select(mask, a, b) form our kernels expect. -template -NANOVDB_FORCEINLINE Simd where(SimdMask mask, Simd a, Simd b) { - auto result = b; - stdx::where(mask, result) = a; - return result; -} -// Heterogeneous where: mask element type U ≠ value element type T. -// Converts the U-mask to a T-mask via a boolean round-trip. -template -NANOVDB_FORCEINLINE Simd where(SimdMask mask, Simd a, Simd b) { - bool arr[W]; - for (int i = 0; i < W; i++) arr[i] = static_cast(mask[i]); - SimdMask tmask(arr, element_aligned); - auto result = b; - stdx::where(tmask, result) = a; - return result; -} - -// 2-argument where: stdx-style masked-assignment proxy. -// where(mask, target) = value writes value[i] into target[i] for lanes where mask[i] is true. -// Heterogeneous mask (mask element type U may differ from value element type T). -// stdx::fixed_size_simd operator[] returns by value, so the assignment delegates to -// a boolean round-trip + stdx::where rather than a per-lane scalar store. -template -struct WhereExpression { - const SimdMask& mask; - Simd& target; - WhereExpression& operator=(const Simd& value) { - bool arr[W]; - for (int i = 0; i < W; ++i) arr[i] = static_cast(mask[i]); - SimdMask tmask(arr, element_aligned); - stdx::where(tmask, target) = value; - return *this; - } -}; -template -inline WhereExpression where(const SimdMask& mask, Simd& target) { - return {mask, target}; -} - -// Horizontal reduction: delegates to stdx::reduce. -// Mirrors std::experimental::reduce(v, binary_op) — same signature, same semantics. -// Use with std::bit_or<>{}, std::bit_and<>{}, std::plus<>{}, etc. -template -inline T reduce(Simd v, BinaryOp op) { return stdx::reduce(v, op); } - -template -inline bool any_of(SimdMask m) { return stdx::any_of(m); } -template -inline bool none_of(SimdMask m) { return stdx::none_of(m); } -template -inline bool all_of(SimdMask m) { return stdx::all_of(m); } - -// Store W lanes of v into p[0..W-1] (stdx calls this copy_to). -template -inline void store(Simd v, T* p, element_aligned_tag = {}) { - v.copy_to(p, element_aligned); -} - // =========================================================================== // Implementation B: std::array backend (default) // =========================================================================== #else +template struct Simd; // fwd-decl so SimdMask::simd_type can name it + template struct SimdMask { + using value_type = bool; + using simd_type = Simd; + static constexpr size_t size() { return size_t(W); } + std::array data{}; SimdMask() = default; - __hostdev__ explicit SimdMask(const bool* p, element_aligned_tag) { + __hostdev__ SimdMask(bool b) { data.fill(b); } // broadcast + __hostdev__ explicit SimdMask(const bool* p, element_aligned_tag) { // load (ctor) for (int i = 0; i < W; i++) data[i] = p[i]; } - // Converting constructor: copy bool values from a mask over a different element type. - // All SimdMask are boolean arrays of the same width; this allows - // where(SimdMask, Simd, Simd) without explicit casting. 
- template - __hostdev__ explicit SimdMask(SimdMask const& o) { - for (int i = 0; i < W; i++) data[i] = o[i]; - } __hostdev__ bool operator[](int i) const { return data[i]; } __hostdev__ bool& operator[](int i) { return data[i]; } + __hostdev__ void copy_from(const bool* p, element_aligned_tag) { // load (member) + for (int i = 0; i < W; i++) data[i] = p[i]; + } + __hostdev__ void copy_to(bool* p, element_aligned_tag) const { // store (member) + for (int i = 0; i < W; i++) p[i] = data[i]; + } __hostdev__ SimdMask operator!() const { SimdMask r; for (int i = 0; i < W; i++) r.data[i] = !data[i]; return r; } @@ -168,16 +113,23 @@ struct SimdMask { template struct Simd { + using value_type = T; + using mask_type = SimdMask; + static constexpr size_t size() { return size_t(W); } + std::array data{}; Simd() = default; __hostdev__ Simd(T scalar) { data.fill(scalar); } // broadcast - __hostdev__ explicit Simd(const T* p, element_aligned_tag) { // load + __hostdev__ explicit Simd(const T* p, element_aligned_tag) { // load (ctor) for (int i = 0; i < W; i++) data[i] = p[i]; } __hostdev__ T operator[](int i) const { return data[i]; } __hostdev__ T& operator[](int i) { return data[i]; } - __hostdev__ void store(T* p, element_aligned_tag = {}) const { // store + __hostdev__ void copy_from(const T* p, element_aligned_tag) { // load (member) + for (int i = 0; i < W; i++) data[i] = p[i]; + } + __hostdev__ void copy_to(T* p, element_aligned_tag) const { // store (member) for (int i = 0; i < W; i++) p[i] = data[i]; } __hostdev__ Simd operator-() const { @@ -195,10 +147,29 @@ struct Simd { __hostdev__ Simd operator/(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] / o.data[i]; return r; } + __hostdev__ Simd& operator+=(Simd o) { + for (int i = 0; i < W; i++) data[i] += o.data[i]; return *this; + } + __hostdev__ Simd& operator-=(Simd o) { + for (int i = 0; i < W; i++) data[i] -= o.data[i]; return *this; + } + __hostdev__ Simd& operator*=(Simd o) { + for (int i = 0; i < W; i++) data[i] *= o.data[i]; return *this; + } + __hostdev__ Simd& operator/=(Simd o) { + for (int i = 0; i < W; i++) data[i] /= o.data[i]; return *this; + } + __hostdev__ SimdMask operator<(Simd o) const { + SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] < o.data[i]; return m; + } + __hostdev__ SimdMask operator<=(Simd o) const { + SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] <= o.data[i]; return m; + } __hostdev__ SimdMask operator>(Simd o) const { - SimdMask m; - for (int i = 0; i < W; i++) m.data[i] = data[i] > o.data[i]; - return m; + SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] > o.data[i]; return m; + } + __hostdev__ SimdMask operator>=(Simd o) const { + SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] >= o.data[i]; return m; } __hostdev__ SimdMask operator==(Simd o) const { SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] == o.data[i]; return m; @@ -249,210 +220,63 @@ Simd operator/(T a, Simd b) { return Simd(a) / b; } template __hostdev__ Simd operator/(Simd a, T b) { return a / Simd(b); } -template -__hostdev__ Simd min(Simd a, Simd b) { - Simd r; for (int i = 0; i < W; i++) r[i] = a[i] < b[i] ? a[i] : b[i]; return r; -} -template -__hostdev__ Simd max(Simd a, Simd b) { - Simd r; for (int i = 0; i < W; i++) r[i] = a[i] > b[i] ? a[i] : b[i]; return r; -} -template -__hostdev__ Simd where(SimdMask mask, Simd a, Simd b) { - Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? 
a[i] : b[i]; return r; -} -// Heterogeneous where: mask element type U need not match value element type T. -// Useful for applying PredicateT=SimdMask to VoxelOffsetT=Simd. -template -__hostdev__ Simd where(SimdMask mask, Simd a, Simd b) { - Simd r; for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; return r; -} - -// 2-argument where: stdx-style masked-assignment proxy. -// where(mask, target) = value writes value[i] into target[i] for lanes where mask[i] is true. -// Heterogeneous mask (mask element type U may differ from value element type T). -template -struct WhereExpression { - const SimdMask& mask; - Simd& target; - __hostdev__ WhereExpression& operator=(const Simd& value) { - for (int i = 0; i < W; ++i) - if (mask[i]) target[i] = value[i]; - return *this; - } -}; -template -__hostdev__ WhereExpression where(const SimdMask& mask, Simd& target) { - return {mask, target}; -} +#endif // NANOVDB_USE_STDX_SIMD -// Horizontal reduction: fold all lanes with a binary operator. -// Mirrors std::experimental::reduce(v, binary_op). -// Use with std::bit_or<>{}, std::bit_and<>{}, std::plus<>{}, etc. -template -__hostdev__ T reduce(Simd v, BinaryOp op) { - T r = v[0]; - for (int i = 1; i < W; ++i) r = op(r, v[i]); - return r; -} +} // namespace experimental +} // namespace util +// --------------------------------------------------------------------------- +// nanovdb::math::Min / Max / Select — Simd overloads. Scalar overloads +// live in nanovdb/math/Math.h; defining the SIMD overloads here avoids a +// Math.h -> Simd.h dependency. +// --------------------------------------------------------------------------- +namespace math { +#ifdef NANOVDB_USE_STDX_SIMD template -__hostdev__ bool any_of(SimdMask m) { - bool r = false; for (int i = 0; i < W; i++) r |= m[i]; return r; +NANOVDB_FORCEINLINE util::experimental::Simd +Min(util::experimental::Simd a, util::experimental::Simd b) { + return std::experimental::min(a, b); } template -__hostdev__ bool none_of(SimdMask m) { return !any_of(m); } -template -__hostdev__ bool all_of(SimdMask m) { - bool r = true; for (int i = 0; i < W; i++) r &= m[i]; return r; +NANOVDB_FORCEINLINE util::experimental::Simd +Max(util::experimental::Simd a, util::experimental::Simd b) { + return std::experimental::max(a, b); } - -// Store W lanes of v into p[0..W-1] (array-backend passthrough to member). +// 3-arg Select(mask, a, b): mask[i] ? a[i] : b[i], via TS v2's where() proxy. template -__hostdev__ void store(Simd v, T* p, element_aligned_tag = {}) { - v.store(p); +NANOVDB_FORCEINLINE util::experimental::Simd +Select(util::experimental::SimdMask mask, + util::experimental::Simd a, + util::experimental::Simd b) { + auto result = b; + util::experimental::stdx::where(mask, result) = a; + return result; } - -#endif // NANOVDB_USE_STD_SIMD - -// --------------------------------------------------------------------------- -// simd_cast — element-wise static_cast between Simd types of the same W. -// -// Used for widening between integer element types (uint16_t → uint32_t, -// uint32_t → uint64_t). Both backends: the array backend uses a lane loop; -// the stdx backend uses the generator constructor, which the compiler lowers -// to a vpmovsxbw / vpmovzxwd sequence or similar sign/zero-extend instruction -// depending on the types. -// Scalar overload: degrades to static_cast for plain scalar types. 
-// --------------------------------------------------------------------------- -template -__hostdev__ Simd simd_cast(Simd src) { -#ifdef NANOVDB_USE_STD_SIMD - return Simd([&](int i) { return static_cast(src[i]); }); #else - Simd r; - for (int i = 0; i < W; ++i) r[i] = static_cast(src[i]); +template +__hostdev__ util::experimental::Simd +Min(util::experimental::Simd a, util::experimental::Simd b) { + util::experimental::Simd r; + for (int i = 0; i < W; i++) r[i] = a[i] < b[i] ? a[i] : b[i]; return r; -#endif } -template -__hostdev__ DstT simd_cast(SrcT src) { return static_cast(src); } - -// --------------------------------------------------------------------------- -// simd_cast_if — masked element-wise cast (merge-masked). -// -// dst[i] = mask[i] ? static_cast(src[i]) : dst[i] (unchanged) -// -// Typical use: widen an integer index type into a wider type before arithmetic, -// keeping invalid (masked-out) lanes at their initial value (usually 0). -// On AVX-512 the compiler may emit a single masked vcvt/vpmovzx instruction. -// On AVX2 it lowers to an unmasked cast + blend. -// -// Scalar fallback: plain conditional cast. -// --------------------------------------------------------------------------- -template -__hostdev__ void simd_cast_if(Simd& dst, SimdMask mask, Simd src) { - dst = where(mask, simd_cast(src), dst); -} -template -__hostdev__ void simd_cast_if(DstT& dst, bool mask, SrcT src) { - if (mask) dst = static_cast(src); -} - -// --------------------------------------------------------------------------- -// simd_traits — generic per-lane access for scalar and Simd types. -// -// Lets algorithms be written once and work for both scalar (width=1) and -// vector (width=W) instantiations. The class does not need to know whether -// it is working with scalars or SIMD vectors. -// -// Primary template: scalar types. -// Specializations below: Simd and SimdMask (both backends). -// --------------------------------------------------------------------------- -template -struct simd_traits { - static constexpr int width = 1; - using scalar_type = T; - __hostdev__ static T get(T v, int) { return v; } - __hostdev__ static void set(T& v, int, T val) { v = val; } -}; - -template<> -struct simd_traits { - static constexpr int width = 1; - using scalar_type = bool; - __hostdev__ static bool get(bool m, int) { return m; } - __hostdev__ static void set(bool& m, int, bool v) { m = v; } -}; - -// Simd and SimdMask: valid for both backends because the aliases -// are already resolved by the time these specializations are instantiated. -template -struct simd_traits> { - static constexpr int width = W; - using scalar_type = T; - __hostdev__ static T get(Simd v, int i) { return v[i]; } - __hostdev__ static void set(Simd& v, int i, T val) { v[i] = val; } -}; - -template -struct simd_traits> { - static constexpr int width = W; - using scalar_type = bool; - __hostdev__ static bool get(SimdMask m, int i) { return m[i]; } - __hostdev__ static void set(SimdMask& m, int i, bool v) { m[i] = v; } -}; - -// --------------------------------------------------------------------------- -// scalar_traits — extract the scalar element type from T or Simd. -// -// Primary template: a plain scalar type is its own element type. -// The = void default parameter reserves a slot for enable_if specialisations. -// Specialisation for Simd: the element type is T. -// scalar_traits_t is a convenience alias for typename scalar_traits::type. 
-// --------------------------------------------------------------------------- -template -struct scalar_traits { using type = T; }; - template -struct scalar_traits> { using type = T; }; - -template -using scalar_traits_t = typename scalar_traits::type; - - -// --------------------------------------------------------------------------- -// to_bitmask — fold SimdMask into a uint32_t (one bit per lane). -// T is the associated element type; only W matters. Requires W <= 32. -// --------------------------------------------------------------------------- -template -__hostdev__ uint32_t to_bitmask(SimdMask m) { - static_assert(W <= 32, "to_bitmask: W must be <= 32"); - uint32_t r = 0; - for (int i = 0; i < W; i++) if (m[i]) r |= (1u << i); +__hostdev__ util::experimental::Simd +Max(util::experimental::Simd a, util::experimental::Simd b) { + util::experimental::Simd r; + for (int i = 0; i < W; i++) r[i] = a[i] > b[i] ? a[i] : b[i]; return r; } - -// --------------------------------------------------------------------------- -// Scalar overloads — always present, for T=float (GPU / scalar path) -// --------------------------------------------------------------------------- -template __hostdev__ T min(T a, T b) { return a < b ? a : b; } -template __hostdev__ T max(T a, T b) { return a > b ? a : b; } -template __hostdev__ T where(bool m, T a, T b) { return m ? a : b; } -template -__hostdev__ T reduce(T v, BinaryOp) { return v; } - -// 2-argument where: scalar masked-assignment proxy matching the Simd form. -// where(mask, target) = value writes value into target only if mask is true. -template -struct ScalarWhereProxy { - bool mask; T& target; - __hostdev__ void operator=(const T& v) { if (mask) target = v; } -}; -template -__hostdev__ ScalarWhereProxy where(bool mask, T& target) { - return {mask, target}; +template +__hostdev__ util::experimental::Simd +Select(util::experimental::SimdMask mask, + util::experimental::Simd a, + util::experimental::Simd b) { + util::experimental::Simd r; + for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i]; + return r; } +#endif +} // namespace math -} // namespace util } // namespace nanovdb diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h index 3e384c7617..4f3d18b781 100644 --- a/nanovdb/nanovdb/util/WenoStencil.h +++ b/nanovdb/nanovdb/util/WenoStencil.h @@ -92,9 +92,9 @@ WENO5(const T& v1, const T& v2, const T& v3, // MaskT (mask type that `>` of T produces). Ground-truth scalar version is // nanovdb::math::GodunovsNormSqrd in nanovdb/math/Stencils.h, which uses a // runtime if/else on `isOutside`. Here we compute both branches uncondition- -// ally and blend via util::where, so the SIMD path has no control-flow -// divergence across lanes. At T=float the scalar where(bool, T, T) overload -// degenerates this to the same semantics as the if/else. +// ally and blend via math::Select, so the SIMD path has no control-flow +// divergence across lanes. At T=float the scalar math::Select(bool, T, T) +// overload degenerates this to the same semantics as the if/else. 
// --------------------------------------------------------------------------- template __hostdev__ NANOVDB_FORCEINLINE T @@ -103,18 +103,18 @@ GodunovsNormSqrd(MaskT isOutside, T dP_ym, T dP_yp, T dP_zm, T dP_zp) { - using util::min; using util::max; using util::where; + using math::Min; using math::Max; using math::Select; const T zero(0.f); - const T outside = max(math::Pow2(max(dP_xm, zero)), math::Pow2(min(dP_xp, zero))) // (dP/dx)² - + max(math::Pow2(max(dP_ym, zero)), math::Pow2(min(dP_yp, zero))) // (dP/dy)² - + max(math::Pow2(max(dP_zm, zero)), math::Pow2(min(dP_zp, zero))); // (dP/dz)² + const T outside = Max(math::Pow2(Max(dP_xm, zero)), math::Pow2(Min(dP_xp, zero))) // (dP/dx)² + + Max(math::Pow2(Max(dP_ym, zero)), math::Pow2(Min(dP_yp, zero))) // (dP/dy)² + + Max(math::Pow2(Max(dP_zm, zero)), math::Pow2(Min(dP_zp, zero))); // (dP/dz)² - const T inside = max(math::Pow2(min(dP_xm, zero)), math::Pow2(max(dP_xp, zero))) - + max(math::Pow2(min(dP_ym, zero)), math::Pow2(max(dP_yp, zero))) - + max(math::Pow2(min(dP_zm, zero)), math::Pow2(max(dP_zp, zero))); + const T inside = Max(math::Pow2(Min(dP_xm, zero)), math::Pow2(Max(dP_xp, zero))) + + Max(math::Pow2(Min(dP_ym, zero)), math::Pow2(Max(dP_yp, zero))) + + Max(math::Pow2(Min(dP_zm, zero)), math::Pow2(Max(dP_zp, zero))); - return where(isOutside, outside, inside); + return Select(isOutside, outside, inside); } } // namespace detail @@ -131,8 +131,8 @@ template class WenoStencil { public: - using FloatV = util::Simd ; - using MaskV = util::SimdMask; + using FloatV = util::experimental::Simd ; + using MaskV = util::experimental::SimdMask; // --- Tap-offset types (compile-time only) ----------------------------- // TapPoint carries the tap offset as a type. Taps is the @@ -277,10 +277,10 @@ WenoStencil::extrapolate(float absBackground) // copysign(absBg, inner): +absBg if inner >= 0, else -absBg. const MaskV isNegInner = zero > values[kInner]; - const FloatV extrap = util::where(isNegInner, -absBg, absBg); + const FloatV extrap = math::Select(isNegInner, -absBg, absBg); // Active lanes keep their own value; inactive lanes take the extrapolated sign-corrected background. - values[k] = util::where(isActive[k], values[k], extrap); + values[k] = math::Select(isActive[k], values[k], extrap); } } From 9b2ef25f937cab6ca519533d8a21e9ebb4b7d476 Mon Sep 17 00:00:00 2001 From: Efty Sifakis Date: Tue, 28 Apr 2026 10:01:27 -0500 Subject: [PATCH 57/60] WenoStencil: ValueType-templated, generic-T WENO5/Godunov, rename tap->point Refactor WenoStencil -> WenoStencil: scalar / CUDA reads as WenoStencil, CPU SIMD as WenoStencil>. MaskType auto-deduced via small detail::mask_of trait. Generic WENO5 and GodunovsNormSqrd helpers re-templated on ValueType (plus optional ScalarType for scale2 precision); using math::Pow2 inside. Drop ground-truth references to math/Stencils.h. Rename TapPoint/Taps/tapIndex/findTap/fillTaps -> StencilPoint/ StencilPoints/pointIndex/findPoint/fillStencil; lowercase template params (i,j,k) with di/dj/dk members (math delta semantics). Scrub non-ASCII characters from all touched files. 
Signed-off-by: Efty Sifakis --- .../ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp | 32 +- nanovdb/nanovdb/util/LegacyStencilAccessor.h | 66 +-- nanovdb/nanovdb/util/Simd.h | 17 +- nanovdb/nanovdb/util/WenoStencil.h | 380 +++++++++--------- 4 files changed, 261 insertions(+), 234 deletions(-) diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp index 8def5c3dc2..8acdd8923f 100644 --- a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp @@ -7,7 +7,7 @@ \brief End-to-end CPU WENO5 norm-square-gradient on a narrow-band level set, with a scalar reference for correctness validation. - Demonstrates the full Phase-2+3 pipeline that BatchAccessor.md §11 has + Demonstrates the full Phase-2+3 pipeline that BatchAccessor.md Sec. 11 has been leading up to: VBM decode -> per-batch sidecar value assembly -> out-of-band @@ -85,7 +85,7 @@ using LeafT = nanovdb::NanoLeaf; using FloatGridT = nanovdb::NanoGrid; using CPUVBM = nanovdb::tools::VoxelBlockManager; -using LegacyAccT = nanovdb::LegacyStencilAccessor>; +using LegacyAccT = nanovdb::LegacyStencilAccessor>; // ============================================================ // VDB loading and NanoVDB conversion @@ -117,9 +117,9 @@ loadFloatGridFromVdb(const std::string& path, const std::string& gridName) } /// NanoVDB conversion products shared across the two passes. -/// - floatHandle : NanoGrid — tile values + in-leaf inactive values +/// - floatHandle : NanoGrid -- tile values + in-leaf inactive values /// preserved verbatim, used by the scalar reference stencil. -/// - indexHandle : NanoGrid — the topology-only index grid. +/// - indexHandle : NanoGrid -- the topology-only index grid. /// - sidecar : float sidecar (slot 0 = background, slots 1..N = active /// voxel values in NanoVDB indexing order). struct ConvertedGrids { @@ -158,7 +158,7 @@ convertFloatGrid(openvdb::FloatGrid& floatGrid) } // ============================================================ -// Reference pass — scalar WenoStencil per active voxel +// Reference pass -- scalar WenoStencil per active voxel // ============================================================ // // Uses nanovdb::math::WenoStencil>. Its moveTo(ijk) @@ -206,7 +206,7 @@ runReference(const FloatGridT& floatGrid, // The two grids share topology, so leaf LID in the // index grid aligns with leaf LID in the float grid // (same order of insertion). Iterate the index grid's - // active voxels — those are the slots we need to fill. + // active voxels -- those are the slots we need to fill. 
const auto& indexLeaf = firstIndexLeaf[lid]; (void)firstFloatLeaf; // stencil.moveTo uses its own acc @@ -223,7 +223,7 @@ runReference(const FloatGridT& floatGrid, } // ============================================================ -// Fast pass — LegacyStencilAccessor gather + WenoStencil compute +// Fast pass -- LegacyStencilAccessor gather + WenoStencil compute // ============================================================ // // Structure: @@ -265,19 +265,21 @@ runFast(const IndexGridT& index return timeIt([&] { nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), [&](const nanovdb::util::Range1D& range) { + using FloatV = nanovdb::util::experimental::Simd ; + using MaskV = nanovdb::util::experimental::SimdMask; + using StencilT = nanovdb::WenoStencil; + constexpr int SIZE = StencilT::size(); + alignas(64) uint32_t leafIndex[BlockWidth]; alignas(64) uint16_t voxelOffset[BlockWidth]; - // Caller-owned fill-side scratch — scalar scatter from the + // Caller-owned fill-side scratch -- scalar scatter from the // sidecar lands here, then a per-tap SIMD load moves the // data into the stencil's Simd compute view. - alignas(64) float raw_values[nanovdb::WenoStencil::size()][SIMDw]; - alignas(64) bool raw_active[nanovdb::WenoStencil::size()][SIMDw]; + alignas(64) float raw_values[SIZE][SIMDw]; + alignas(64) bool raw_active[SIZE][SIMDw]; - nanovdb::WenoStencil stencil(dx); - constexpr int SIZE = nanovdb::WenoStencil::size(); - using FloatV = nanovdb::util::experimental::Simd ; - using MaskV = nanovdb::util::experimental::SimdMask; + StencilT stencil(dx); // One LegacyStencilAccessor per TBB task (one ReadAccessor). LegacyAccT legacyAcc(indexGrid); @@ -359,7 +361,7 @@ runFast(const IndexGridT& index // from 0 to 1e+1 plus a tail bucket for anything >= 10. // // Expected shape: the two leftmost bins ([0,1e-8), [1e-8,1e-7)) hold -// the overwhelming majority — FP-rounding / FMA-fusion differences. +// the overwhelming majority -- FP-rounding / FMA-fusion differences. // Anything to the right of [1e-5,1e-4) warrants investigation. static void diff --git a/nanovdb/nanovdb/util/LegacyStencilAccessor.h b/nanovdb/nanovdb/util/LegacyStencilAccessor.h index 06a2a263b3..377267fa48 100644 --- a/nanovdb/nanovdb/util/LegacyStencilAccessor.h +++ b/nanovdb/nanovdb/util/LegacyStencilAccessor.h @@ -6,13 +6,13 @@ \brief Scalar stencil-index accessor using a NanoVDB ReadAccessor. - LegacyStencilAccessor resolves each stencil tap via a path-cached + LegacyStencilAccessor resolves each stencil point via a path-cached NanoVDB ReadAccessor, one voxel at a time. It is templatized on a - StencilT policy class whose Taps tuple defines the tap offsets. + StencilT policy class whose StencilPoints tuple defines the point offsets. This mirrors the approach of OpenVDB's math/Stencils.h: the accessor - caches the last-visited tree path so that consecutive taps within the - same leaf are cheap, but distant taps (e.g. WENO5 radius-3 offsets) + caches the last-visited tree path so that consecutive points within the + same leaf are cheap, but distant points (e.g. WENO5 radius-3 offsets) can evict the center-leaf path. Thread safety @@ -23,9 +23,9 @@ ------------------- BuildT NanoVDB build type (e.g. ValueOnIndex). StencilT Policy class describing the stencil. Must expose: - using Taps = std::tuple...>; + using StencilPoints = std::tuple...>; where each S is any type with static int members di, dj, dk - (e.g. WenoStencil<>::TapPoint). + (e.g. WenoStencil::StencilPoint). 
*/ #pragma once @@ -44,20 +44,21 @@ class LegacyStencilAccessor { using GridT = NanoGrid; - static constexpr int SIZE = int(std::tuple_size_v); + static constexpr int SIZE = int(std::tuple_size_v); - // Compile-time inverse map: (DI,DJ,DK) → slot index in StencilT::Taps. - // Returns -1 if no matching tap exists; getValue() turns that into a - // static_assert. Same shape as WenoStencil::findTap (kept local here - // to avoid a cross-header dependency). - template - static constexpr int findTap(std::index_sequence) + // Compile-time inverse map: (i,j,k) -> slot index in + // StencilT::StencilPoints. Returns -1 if no matching point exists; + // getValue() turns that into a static_assert. Same shape as + // WenoStencil::findPoint (kept local here to avoid a cross-header + // dependency). + template + static constexpr int findPoint(std::index_sequence) { - using Taps = typename StencilT::Taps; + using StencilPoints = typename StencilT::StencilPoints; int result = -1; - ((std::tuple_element_t::di == DI && - std::tuple_element_t::dj == DJ && - std::tuple_element_t::dk == DK && + ((std::tuple_element_t::di == i && + std::tuple_element_t::dj == j && + std::tuple_element_t::dk == k && result < 0 ? (result = int(Is)) : 0), ...); return result; } @@ -75,32 +76,33 @@ class LegacyStencilAccessor : mAcc(grid.tree().root()) {} // ------------------------------------------------------------------------- - // moveTo -- resolve all SIZE tap indices for the voxel at @a center. + // moveTo -- resolve all SIZE stencil-point indices for the voxel at @a center. // - // Calls ReadAccessor::getValue(center + offset) for each tap in StencilT::Taps. - // The path cache inside mAcc amortizes tree-traversal cost for nearby taps, - // but distant taps (e.g. WENO5 ±3) may evict the center-leaf path. + // Calls ReadAccessor::getValue(center + offset) for each point in + // StencilT::StencilPoints. The path cache inside mAcc amortizes + // tree-traversal cost for nearby points, but distant points (e.g. WENO5 + // +/-3) may evict the center-leaf path. // // Results are valid until the next moveTo() call. // ------------------------------------------------------------------------- void moveTo(const Coord& center) { - fillTaps(center, std::make_index_sequence{}); + fillStencil(center, std::make_index_sequence{}); } // ------------------------------------------------------------------------- - // operator[] -- indexed tap access. i must be in [0, SIZE). + // operator[] -- indexed point access. i must be in [0, SIZE). // ------------------------------------------------------------------------- uint64_t operator[](int i) const { return mStencil[i]; } // ------------------------------------------------------------------------- - // getValue -- compile-time named tap access. + // getValue -- compile-time named point access. 
// ------------------------------------------------------------------------- - template + template uint64_t getValue() const { - constexpr int I = findTap(std::make_index_sequence{}); - static_assert(I >= 0, "LegacyStencilAccessor::getValue: tap not in stencil"); + constexpr int I = findPoint(std::make_index_sequence{}); + static_assert(I >= 0, "LegacyStencilAccessor::getValue: point not in stencil"); return mStencil[I]; } @@ -108,14 +110,14 @@ class LegacyStencilAccessor private: template - void fillTaps(const Coord& center, std::index_sequence) + void fillStencil(const Coord& center, std::index_sequence) { - using Taps = typename StencilT::Taps; + using StencilPoints = typename StencilT::StencilPoints; ((mStencil[Is] = static_cast( mAcc.getValue(center + Coord( - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk)))), ...); + std::tuple_element_t::di, + std::tuple_element_t::dj, + std::tuple_element_t::dk)))), ...); } AccessorT mAcc; diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h index 666acab0e4..7d8be384db 100644 --- a/nanovdb/nanovdb/util/Simd.h +++ b/nanovdb/nanovdb/util/Simd.h @@ -11,9 +11,10 @@ \brief Minimal SIMD abstraction for NanoVDB stencil kernels. */ -#pragma once +#ifndef NANOVDB_UTIL_SIMD_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_SIMD_H_HAS_BEEN_INCLUDED + #include -#include #include // __hostdev__ @@ -45,13 +46,13 @@ namespace nanovdb { namespace util { // =========================================================================== -// nanovdb::util::experimental — internal SIMD primitives. Names in this +// nanovdb::util::experimental -- internal SIMD primitives. Names in this // nested namespace are unstable API by convention; external callers should // not depend on them. // =========================================================================== namespace experimental { -// element_aligned_tag — load/store alignment descriptor. Aliases +// element_aligned_tag -- load/store alignment descriptor. Aliases // stdx::element_aligned_tag in the stdx backend; empty struct in the // array backend. #ifdef NANOVDB_USE_STDX_SIMD @@ -63,7 +64,7 @@ struct element_aligned_tag {}; inline constexpr element_aligned_tag element_aligned{}; // =========================================================================== -// Implementation A: std::experimental::simd — pure type aliases +// Implementation A: std::experimental::simd -- pure type aliases // =========================================================================== #ifdef NANOVDB_USE_STDX_SIMD @@ -177,7 +178,7 @@ struct Simd { __hostdev__ SimdMask operator!=(Simd o) const { SimdMask m; for (int i = 0; i < W; i++) m.data[i] = data[i] != o.data[i]; return m; } - // Bitwise and shift operators — valid for integer element types. + // Bitwise and shift operators -- valid for integer element types. __hostdev__ Simd operator|(Simd o) const { Simd r; for (int i = 0; i < W; i++) r.data[i] = data[i] | o.data[i]; return r; } @@ -226,7 +227,7 @@ Simd operator/(Simd a, T b) { return a / Simd(b); } } // namespace util // --------------------------------------------------------------------------- -// nanovdb::math::Min / Max / Select — Simd overloads. Scalar overloads +// nanovdb::math::Min / Max / Select -- Simd overloads. Scalar overloads // live in nanovdb/math/Math.h; defining the SIMD overloads here avoids a // Math.h -> Simd.h dependency. 
 // ---------------------------------------------------------------------------
@@ -280,3 +281,5 @@ Select(util::experimental::SimdMask<T, W> mask,
 } // namespace math
 
 } // namespace nanovdb
+
+#endif // end of NANOVDB_UTIL_SIMD_H_HAS_BEEN_INCLUDED
diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h
index 4f3d18b781..1c50b6ca5d 100644
--- a/nanovdb/nanovdb/util/WenoStencil.h
+++ b/nanovdb/nanovdb/util/WenoStencil.h
@@ -4,37 +4,37 @@
 /*!
     \file WenoStencil.h
-    \brief 19-tap WENO5 stencil value container + out-of-band extrapolation
+
+    \brief 19-point WENO5 stencil value container + out-of-band extrapolation
            + fifth-order upwind Godunov's norm-square gradient. Templated on
-           SIMD lane width W.
+           a ValueType.
 
-    `WenoStencil` holds the per-tap float values and activity flags for a
-    single voxel (W=1, scalar / GPU-friendly) or a batch of W voxels (W>1,
-    CPU SIMD). Storage is first-class Simd types directly:
+    `WenoStencil` holds the per-point values and activity flags.
+    ValueType is typically a raw scalar `float` (e.g. for scalar / CUDA code)
+    or a SIMD vector `Simd<float, W>` (CPU batch path). The companion mask
+    type is auto-deduced (bool for raw scalars, ValueType::mask_type
+    otherwise):
 
-        FloatV values  [19]   ≡ Simd<float, W>     values  [19]
-        MaskV  isActive[19]   ≡ SimdMask<float, W> isActive[19]
+        ValueType values  [19]
+        MaskType  isActive[19]
 
-    At W=1 the Simd types collapse to plain float / bool so scalar CUDA code
-    reads as plain scalar arithmetic, and the class is pure-compute — the
-    caller owns any fill-side C-array storage it wants to use for scalar
-    scatter before an explicit load-into-Simd step.
+    For raw scalars the class reads as plain scalar arithmetic; for SIMD
+    vectors the same source compiles to whole-vector ops via Simd.h. The
+    class is pure-compute -- the caller owns any fill-side C-array storage
+    and the per-point load step.
 
-    Grid-spacing scalars `mDx2` and `mInvDx2` stay scalar `float` at every
-    W and are broadcast to FloatV at the point of use.
+    Grid-spacing scalars `mDx2` and `mInvDx2` stay scalar `float` and are
+    broadcast to ValueType at the point of use.
 
     Operations provided:
     - extrapolate(absBackground)
         Repair out-of-band lanes (isActive[k] == false) via
-        copysign(absBackground, values[innerTap]), processed in
-        ascending-|Δ| order so the inner tap is already resolved.
+        copysign(absBackground, values[innerPoint]), processed in
+        ascending-|d| order so the inner point is already resolved.
     - normSqGrad(isoValue = 0)
         Godunov's norm-square of the fifth-order WENO upwind gradient.
-        Matches the semantics of WenoStencil::normSqGrad(isoValue) in
-        nanovdb/math/Stencils.h (the ground-truth scalar reference).
 
-    See BatchAccessor.md §11 for the full Phase-2 sidecar-WENO pipeline design
-    and §11.2 for the extrapolation semantics.
+    See BatchAccessor.md Sec. 11 for the full Phase-2 sidecar-WENO pipeline design
+    and Sec. 11.2 for the extrapolation semantics.
 */
 
 #pragma once
@@ -44,6 +44,7 @@
 #include <tuple>
 #include <utility>
 
+#include <type_traits>
 #include <nanovdb/util/Simd.h>
 
 namespace nanovdb {
@@ -51,189 +52,214 @@ namespace nanovdb {
 namespace detail {
 
 // ---------------------------------------------------------------------------
-// Generic-T WENO5 reconstruction — templated on T ∈ {float, Simd<float, W>}.
+// mask_of<T>::type -- auto-deduced predicate type for ValueType T:
+//   - T::mask_type if T has a nested mask_type (e.g. Simd<float, W>);
+//   - bool otherwise (e.g. raw scalar T=float).
+// ---------------------------------------------------------------------------
+template<typename T, typename = void>
+struct mask_of { using type = bool; };
+
+template<typename T>
+struct mask_of<T, std::void_t<typename T::mask_type>> {
+    using type = typename T::mask_type;
+};
+
+// ---------------------------------------------------------------------------
+// Generic WENO5 reconstruction -- templated on ValueType in {float,
+// Simd<float, W>}. Nominally fifth-order finite-difference WENO (Chi-Wang
+// Shu, "High Order Finite Difference and Finite Volume WENO Schemes and
+// Discontinuous Galerkin Methods for CFD", ICASE Report No 2001-11 page 6;
+// see also ICASE 97-65 for a more complete reference, Shu 1997).
 //
-// Structurally identical to nanovdb::math::WENO5 (ground-truth scalar WENO5
-// in nanovdb/math/Stencils.h), transliterated to use only primitives that
-// exist for both scalar T=float and Simd<float, W>: operator+/-/*, math::Pow2.
-// No ternaries, no if/else — same source compiles to scalar or SIMD code
-// via the Simd backend in Simd.h.
+// Given v1=f(x-2dx), v2=f(x-dx), v3=f(x), v4=f(x+dx), v5=f(x+2dx), returns
+// an interpolated f(x+dx/2) with the property that
+// (f(x+dx/2)-f(x-dx/2))/dx = df/dx(x) + error, with error fifth-order in
+// smooth regions: O(dx) <= error <= O(dx^5).
 //
-// scale2 is the optional reference magnitude (squared) used to scale the
-// numerical epsilon; kept as a plain float for broadcast-on-demand.
+// Body uses only primitives common to scalar and Simd<float, W> ValueType
+// (operator+/-/*, math::Pow2), so the same source compiles in both modes
+// via the Simd backend in Simd.h. Integer coefficients carry explicit
+// ValueType(...) casts for SIMD typed-operator dispatch, and float
+// literals carry .f suffix because stdx::simd's broadcast ctor rejects
+// double->float narrowing.
+//
+// ScalarType (defaults to float, deduced from scale2 if specified) is the
+// arithmetic precision of the reference-magnitude epsilon scaling. scale2
+// stays scalar so callers can pass plain float/double grid constants
+// without broadcasting.
 // ---------------------------------------------------------------------------
-template<typename T, typename RealT = T>
-__hostdev__ NANOVDB_FORCEINLINE T
-WENO5(const T& v1, const T& v2, const T& v3,
-      const T& v4, const T& v5,
-      float scale2 = 1.f)
+template<typename ValueType, typename ScalarType = float>
+__hostdev__ NANOVDB_FORCEINLINE ValueType
+WENO5(const ValueType& v1, const ValueType& v2, const ValueType& v3,
+      const ValueType& v4, const ValueType& v5,
+      ScalarType scale2 = ScalarType(1)) // openvdb uses scale2 = 0.01
 {
-    const RealT C = RealT(13.f / 12.f);
-    const RealT eps = RealT(1.e-6f * scale2);
-
-    const RealT A1 = RealT(0.1f) / math::Pow2(
-        C * math::Pow2(v1 - RealT(2)*v2 + v3)
-        + RealT(0.25f) * math::Pow2(v1 - RealT(4)*v2 + RealT(3)*v3) + eps);
-    const RealT A2 = RealT(0.6f) / math::Pow2(
-        C * math::Pow2(v2 - RealT(2)*v3 + v4)
-        + RealT(0.25f) * math::Pow2(v2 - v4) + eps);
-    const RealT A3 = RealT(0.3f) / math::Pow2(
-        C * math::Pow2(v3 - RealT(2)*v4 + v5)
-        + RealT(0.25f) * math::Pow2(RealT(3)*v3 - RealT(4)*v4 + v5) + eps);
-
-    return (A1 * (RealT( 2)*v1 - RealT(7)*v2 + RealT(11)*v3)
-          + A2 * (RealT( 5)*v3 - v2 + RealT( 2)*v4)
-          + A3 * (RealT( 2)*v3 + RealT(5)*v4 - v5))
-          / (RealT(6) * (A1 + A2 + A3));
+    using math::Pow2;
+    const ValueType C = ValueType(13.f / 12.f);
+    // WENO is formulated for non-dimensional equations, here the optional scale2
+    // is a reference value (squared) for the function being interpolated. For
+    // example if 'v' is of order 1000, then scale2 = 10^6 is ok. But in practice
+    // leave scale2 = 1.
+    const ValueType eps = ValueType(ScalarType(1.e-6) * scale2);
+    // {\tilde \omega_k} = \gamma_k / (\beta_k + \epsilon)^2 in Shu's ICASE report.
+    const ValueType A1 = ValueType(0.1f) / Pow2(
+        C * Pow2(v1 - ValueType(2)*v2 + v3)
+        + ValueType(0.25f) * Pow2(v1 - ValueType(4)*v2 + ValueType(3)*v3) + eps);
+    const ValueType A2 = ValueType(0.6f) / Pow2(
+        C * Pow2(v2 - ValueType(2)*v3 + v4)
+        + ValueType(0.25f) * Pow2(v2 - v4) + eps);
+    const ValueType A3 = ValueType(0.3f) / Pow2(
+        C * Pow2(v3 - ValueType(2)*v4 + v5)
+        + ValueType(0.25f) * Pow2(ValueType(3)*v3 - ValueType(4)*v4 + v5) + eps);
+
+    return (A1 * (ValueType(2)*v1 - ValueType(7)*v2 + ValueType(11)*v3)
+          + A2 * (ValueType(5)*v3 -              v2 + ValueType( 2)*v4)
+          + A3 * (ValueType(2)*v3 + ValueType(5)*v4 -              v5))
+          / (ValueType(6) * (A1 + A2 + A3));
 }
 
 // ---------------------------------------------------------------------------
-// Generic-T Godunov's norm-square gradient — templated on T (value type) and
-// MaskT (mask type that `>` of T produces). Ground-truth scalar version is
-// nanovdb::math::GodunovsNormSqrd in nanovdb/math/Stencils.h, which uses a
-// runtime if/else on `isOutside`. Here we compute both branches uncondition-
-// ally and blend via math::Select, so the SIMD path has no control-flow
-// divergence across lanes. At T=float the scalar math::Select(bool, T, T)
-// overload degenerates this to the same semantics as the if/else.
+// Generic Godunov's norm-square gradient -- templated on ValueType
+// in {float, Simd<float, W>} and a companion MaskType (bool for raw scalar,
+// ValueType::mask_type for Simd). Differs from a textbook scalar form in
+// shape: instead of an if/else on isOutside we compute both branches
+// unconditionally and blend via math::Select, so the SIMD path has no
+// control-flow divergence across lanes. At ValueType=float the scalar
+// math::Select(bool, ValueType, ValueType) overload reduces this to the
+// equivalent if/else semantics.
// ---------------------------------------------------------------------------
-template <typename T, typename MaskT>
-__hostdev__ NANOVDB_FORCEINLINE T
-GodunovsNormSqrd(MaskT isOutside,
-                 T dP_xm, T dP_xp,
-                 T dP_ym, T dP_yp,
-                 T dP_zm, T dP_zp)
+template <typename ValueType, typename MaskType>
+__hostdev__ NANOVDB_FORCEINLINE ValueType
+GodunovsNormSqrd(MaskType isOutside,
+                 ValueType dP_xm, ValueType dP_xp,
+                 ValueType dP_ym, ValueType dP_yp,
+                 ValueType dP_zm, ValueType dP_zp)
 {
-    using math::Min; using math::Max; using math::Select;
-    const T zero(0.f);
+    using math::Min; using math::Max; using math::Pow2; using math::Select;
+    const ValueType zero(0.f);
 
-    const T outside = Max(math::Pow2(Max(dP_xm, zero)), math::Pow2(Min(dP_xp, zero)))   // (dP/dx)²
-                    + Max(math::Pow2(Max(dP_ym, zero)), math::Pow2(Min(dP_yp, zero)))   // (dP/dy)²
-                    + Max(math::Pow2(Max(dP_zm, zero)), math::Pow2(Min(dP_zp, zero)));  // (dP/dz)²
+    const ValueType outside = Max(Pow2(Max(dP_xm, zero)), Pow2(Min(dP_xp, zero)))   // (dP/dx)^2
+                            + Max(Pow2(Max(dP_ym, zero)), Pow2(Min(dP_yp, zero)))   // (dP/dy)^2
+                            + Max(Pow2(Max(dP_zm, zero)), Pow2(Min(dP_zp, zero)));  // (dP/dz)^2
 
-    const T inside = Max(math::Pow2(Min(dP_xm, zero)), math::Pow2(Max(dP_xp, zero)))
-                   + Max(math::Pow2(Min(dP_ym, zero)), math::Pow2(Max(dP_yp, zero)))
-                   + Max(math::Pow2(Min(dP_zm, zero)), math::Pow2(Max(dP_zp, zero)));
+    const ValueType inside = Max(Pow2(Min(dP_xm, zero)), Pow2(Max(dP_xp, zero)))
+                           + Max(Pow2(Min(dP_ym, zero)), Pow2(Max(dP_yp, zero)))
+                           + Max(Pow2(Min(dP_zm, zero)), Pow2(Max(dP_zp, zero)));
 
-    return Select(isOutside, outside, inside);
+    return Select(isOutside, outside, inside); // |\nabla\phi|^2
 }
 
 } // namespace detail
 
 // ---------------------------------------------------------------------------
-// WenoStencil — pure-compute container for a 19-tap WENO5 stencil state.
-//
-// The class holds only Simd-typed compute state + scalar grid constants.
-// Fill-side responsibility (scalar writes into any raw float/bool buffers,
-// followed by a SIMD load-per-tap into this stencil's values[] / isActive[])
-// lives in the caller. See WenoStencil.md §6 for usage patterns.
+// WenoStencil -- pure-compute container for a 19-point WENO5
+// stencil state. Holds ValueType-typed values + MaskType-typed activity
+// flags + scalar grid constants. Fill-side responsibility (scalar writes
+// into any raw buffers, followed by a per-point load into this stencil's
+// values[] / isActive[]) lives in the caller. See WenoStencil.md Sec. 6
+// for usage patterns.
 // ---------------------------------------------------------------------------
-template <int W>
+template <typename ValueType>
 class WenoStencil
 {
 public:
-    using FloatV = util::experimental::Simd<float, W>;
-    using MaskV  = util::experimental::SimdMask<W>;
+    using MaskType = typename detail::mask_of<ValueType>::type;
 
-    // --- Tap-offset types (compile-time only) -----------------------------
-    // TapPoint<DI,DJ,DK> carries the tap offset as a type. Taps is the
-    // 19-tap tuple in the canonical WenoPt::idx ordering from
-    // nanovdb/math/Stencils.h:
+    // --- Stencil-point offset types (compile-time only) -------------------
+    // StencilPoint<i,j,k> carries the offset as a type. StencilPoints is
+    // the 19-point tuple in the canonical idx ordering:
     //   idx 0     : center  < 0, 0, 0>
     //   idx 1.. 6 : x-axis  <-3,0,0> <-2,0,0> <-1,0,0> <+1,0,0> <+2,0,0> <+3,0,0>
     //   idx 7..12 : y-axis  <0,-3,0> <0,-2,0> <0,-1,0> <0,+1,0> <0,+2,0> <0,+3,0>
     //   idx 13..18 : z-axis <0,0,-3> <0,0,-2> <0,0,-1> <0,0,+1> <0,0,+2> <0,0,+3>
-    template <int DI, int DJ, int DK>
-    struct TapPoint {
-        static constexpr int di = DI, dj = DJ, dk = DK;
+    template <int i, int j, int k>
+    struct StencilPoint {
+        static constexpr int di = i, dj = j, dk = k;
     };
 
-    using Taps = std::tuple<
-        TapPoint< 0, 0, 0>,
-        TapPoint<-3, 0, 0>, TapPoint<-2, 0, 0>, TapPoint<-1, 0, 0>,
-        TapPoint<+1, 0, 0>, TapPoint<+2, 0, 0>, TapPoint<+3, 0, 0>,
-        TapPoint< 0,-3, 0>, TapPoint< 0,-2, 0>, TapPoint< 0,-1, 0>,
-        TapPoint< 0,+1, 0>, TapPoint< 0,+2, 0>, TapPoint< 0,+3, 0>,
-        TapPoint< 0, 0,-3>, TapPoint< 0, 0,-2>, TapPoint< 0, 0,-1>,
-        TapPoint< 0, 0,+1>, TapPoint< 0, 0,+2>, TapPoint< 0, 0,+3>
+    using StencilPoints = std::tuple<
+        StencilPoint< 0, 0, 0>,
+        StencilPoint<-3, 0, 0>, StencilPoint<-2, 0, 0>, StencilPoint<-1, 0, 0>,
+        StencilPoint<+1, 0, 0>, StencilPoint<+2, 0, 0>, StencilPoint<+3, 0, 0>,
+        StencilPoint< 0,-3, 0>, StencilPoint< 0,-2, 0>, StencilPoint< 0,-1, 0>,
+        StencilPoint< 0,+1, 0>, StencilPoint< 0,+2, 0>, StencilPoint< 0,+3, 0>,
+        StencilPoint< 0, 0,-3>, StencilPoint< 0, 0,-2>, StencilPoint< 0, 0,-1>,
+        StencilPoint< 0, 0,+1>, StencilPoint< 0, 0,+2>, StencilPoint< 0, 0,+3>
     >;
-    static constexpr int SIZE = int(std::tuple_size_v<Taps>);
+    static constexpr int SIZE = int(std::tuple_size_v<StencilPoints>);
     static constexpr int size() { return SIZE; }
 
-    // Compute-side storage — first-class Simd values. At W=1 these collapse
-    // to plain scalar float / bool under the array backend.
-    FloatV values  [SIZE];
-    MaskV  isActive[SIZE];
+    // Compute-side storage. At ValueType=float these are plain float / bool
+    // arrays; at ValueType=Simd<float, W> they are whole-vector arrays.
+    ValueType values  [SIZE];
+    MaskType  isActive[SIZE];
 
-    // Runtime grid-spacing constants — plain scalars at every W, broadcast
-    // to FloatV at the use sites inside normSqGrad(). Storing them as
-    // scalars saves YMM-register pressure (vbroadcastss folds into the FMA
-    // consumer on x86) and keeps the W=1 code path free of any Simd wrapper.
-    float mDx2{1.f};    // dx² — fed to WENO5's epsilon via scale2
-    float mInvDx2{1.f}; // 1 / dx² — final normalisation in normSqGrad
+    // Runtime grid-spacing constants -- plain scalars regardless of ValueType,
+    // broadcast inside normSqGrad().
+    float mDx2{1.f};    // dx^2 -- fed to WENO5's epsilon via scale2
+    float mInvDx2{1.f}; // 1 / dx^2 -- final normalisation in normSqGrad
 
     __hostdev__ WenoStencil() = default;
     __hostdev__ explicit WenoStencil(float dx)
        : mDx2(dx * dx), mInvDx2(1.f / (dx * dx)) {}
 
-    // Compile-time named-tap access: returns the index of tap (DI,DJ,DK) in
-    // the Taps tuple. Ordering matches WenoPt::idx in
-    // nanovdb/math/Stencils.h.
-    template <int DI, int DJ, int DK>
-    static constexpr int tapIndex()
+    // Compile-time named-point access: returns the index of point (i,j,k)
+    // in the StencilPoints tuple.
+    template <int i, int j, int k>
+    static constexpr int pointIndex()
     {
-        constexpr int I = findTap<DI, DJ, DK>(std::make_index_sequence<SIZE>{});
-        static_assert(I >= 0, "WenoStencil::tapIndex: tap not in stencil");
+        constexpr int I = findPoint<i, j, k>(std::make_index_sequence<SIZE>{});
+        static_assert(I >= 0, "WenoStencil::pointIndex: point not in stencil");
        return I;
     }
 
     // ------------------------------------------------------------------
-    // extrapolate — repair out-of-band lanes (isActive[k][i] == false) of
-    // values[k] with copysign(absBackground, values[innerTap][i]). Active
-    // lanes are preserved. Center tap (idx 0) is assumed always in-band
-    // and is not processed.
+    // extrapolate -- repair out-of-band lanes (isActive[k][i] == false) of
+    // values[k] with copysign(absBackground, values[innerPoint][i]).
+    // Active lanes are preserved. Center point (idx 0) is assumed always
+    // in-band and is not processed.
     //
-    // Processes 18 (tap, innerTap) pairs in ascending-|Δ| order so the
-    // inner tap is already resolved when the outer tap is touched;
-    // sign-inheritance through |Δ|=1 → |Δ|=2 → |Δ|=3 is automatic.
+    // Processes 18 (point, innerPoint) pairs in ascending-|d| order so the
+    // inner point is already resolved when the outer point is touched;
+    // sign-inheritance through |d|=1 -> |d|=2 -> |d|=3 is automatic.
     //
-    // Requires absBackground ≥ 0.
+    // Requires absBackground >= 0.
     // ------------------------------------------------------------------
     __hostdev__ NANOVDB_FORCEINLINE void extrapolate(float absBackground);
 
     // ------------------------------------------------------------------
-    // normSqGrad — Godunov's norm-square of the fifth-order WENO upwind
-    // gradient at the stencil center. Returns |∇φ|².
+    // normSqGrad -- Godunov's norm-square of the fifth-order WENO upwind
+    // gradient at the stencil center. Returns |\nabla\phi|^2.
     //
-    // Semantics match WenoStencil::normSqGrad(isoValue) in
-    // nanovdb/math/Stencils.h line-for-line: six axial WENO5 reconstructions
-    // (one pair ±x, ±y, ±z), then Godunov's upwind combinator driven by the
-    // sign of (center − iso).
+    // Six axial WENO5 reconstructions (one pair +/-x, +/-y, +/-z), then
+    // Godunov's upwind combinator driven by the sign of (center - iso).
     //
     // Call only after the stencil has been populated (see usage pattern in
-    // WenoStencil.md §6). extrapolate() is idempotent w.r.t. this — calling
-    // normSqGrad after extrapolate is the typical pipeline shape, but the
-    // method itself does not require extrapolate to have been called.
+    // WenoStencil.md Sec. 6). extrapolate() before normSqGrad() is the
+    // typical pipeline shape but is not required by this method.
     // ------------------------------------------------------------------
-    __hostdev__ NANOVDB_FORCEINLINE FloatV normSqGrad(float iso = 0.f) const;
+    __hostdev__ NANOVDB_FORCEINLINE ValueType normSqGrad(float iso = 0.f) const;
 
 private:
-    // Compile-time inverse map: (DI,DJ,DK) → slot index in Taps. Returns -1
-    // if no matching tap exists; tapIndex() turns that into a static_assert.
-    template <int DI, int DJ, int DK, size_t... Is>
-    static constexpr int findTap(std::index_sequence<Is...>)
+    // Compile-time inverse map: (i,j,k) -> slot index in StencilPoints.
+    // Returns -1 if no matching point exists; pointIndex() turns that into
+    // a static_assert.
+    template <int i, int j, int k, size_t... Is>
+    static constexpr int findPoint(std::index_sequence<Is...>)
     {
        int result = -1;
-        ((std::tuple_element_t<Is, Taps>::di == DI &&
-          std::tuple_element_t<Is, Taps>::dj == DJ &&
-          std::tuple_element_t<Is, Taps>::dk == DK &&
+        ((std::tuple_element_t<Is, StencilPoints>::di == i &&
+          std::tuple_element_t<Is, StencilPoints>::dj == j &&
+          std::tuple_element_t<Is, StencilPoints>::dk == k &&
           result < 0 ? (result = int(Is)) : 0), ...);
        return result;
     }
 
-    // Hardcoded (tap, innerTap) pairs for the 19-tap Taps tuple, ordered by
-    // ascending |Δ| so the inner tap is always already resolved when the
-    // outer tap is processed. Indices match the Taps tuple above.
+    // Hardcoded (point, innerPoint) pairs for the 19-point StencilPoints
+    // tuple, ordered by ascending |d| so the inner point is always already
+    // resolved when the outer point is processed. Indices match the
+    // StencilPoints tuple above.
     //
     //   idx 0     : center ( 0, 0, 0)
     //   idx 1.. 6 : x-axis (-3..+3 in the order -3,-2,-1,+1,+2,+3)
@@ -241,43 +267,40 @@ class WenoStencil
     //   idx 13..18 : z-axis (-3..+3)
     static constexpr int kNumPairs = 18;
     static constexpr int kPairs[kNumPairs][2] = {
-        // |Δ|=1 (inner tap = center, idx 0)
+        // |d|=1 (inner point = center, idx 0)
        { 3,  0}, { 4,  0},   // x: -1, +1
        { 9,  0}, {10,  0},   // y: -1, +1
        {15,  0}, {16,  0},   // z: -1, +1
-        // |Δ|=2 (inner tap = |Δ|=1 on same axis)
-        { 2,  3}, { 5,  4},   // x: -2 ← (-1), +2 ← (+1)
+        // |d|=2 (inner point = |d|=1 on same axis)
+        { 2,  3}, { 5,  4},   // x: -2 <- (-1), +2 <- (+1)
        { 8,  9}, {11, 10},   // y
        {14, 15}, {17, 16},   // z
-        // |Δ|=3 (inner tap = |Δ|=2 on same axis)
-        { 1,  2}, { 6,  5},   // x: -3 ← (-2), +3 ← (+2)
+        // |d|=3 (inner point = |d|=2 on same axis)
+        { 1,  2}, { 6,  5},   // x: -3 <- (-2), +3 <- (+2)
        { 7,  8}, {12, 11},   // y
        {13, 14}, {18, 17}    // z
     };
 };
 
 // ---------------------------------------------------------------------------
-// extrapolate — single-source implementation.
-//
-// values[] and isActive[] are already Simd-typed; the algorithm is a
-// sequence of whole-SIMD blends (plus a broadcast of absBg) per pair.
-// Same source body compiles at W=1 (Simd<float, 1> collapses to scalar)
-// and W>1 (native SIMD width).
+// extrapolate -- single-source implementation. At ValueType=float this is
+// scalar code; at ValueType=Simd<float, W> the same source compiles to
+// whole-SIMD blends via the math::Select dispatch.
 // ---------------------------------------------------------------------------
-template <int W>
+template <typename ValueType>
 __hostdev__ NANOVDB_FORCEINLINE void
-WenoStencil<W>::extrapolate(float absBackground)
+WenoStencil<ValueType>::extrapolate(float absBackground)
 {
-    const FloatV absBg(absBackground);
-    const FloatV zero (0.f);
+    const ValueType absBg(absBackground);
+    const ValueType zero (0.f);
 
     for (int p = 0; p < kNumPairs; ++p) {
        const int k      = kPairs[p][0];
        const int kInner = kPairs[p][1];
 
        // copysign(absBg, inner): +absBg if inner >= 0, else -absBg.
-        const MaskV  isNegInner = zero > values[kInner];
-        const FloatV extrap     = math::Select(isNegInner, -absBg, absBg);
+        const MaskType  isNegInner = zero > values[kInner];
+        const ValueType extrap     = math::Select(isNegInner, -absBg, absBg);
 
        // Active lanes keep their own value; inactive lanes take the extrapolated sign-corrected background.
        values[k] = math::Select(isActive[k], values[k], extrap);
@@ -285,30 +308,27 @@ WenoStencil<W>::extrapolate(float absBackground)
 }
 
 // ---------------------------------------------------------------------------
-// normSqGrad — Godunov's upwind WENO norm-square gradient.
-//
-// Structurally mirrors WenoStencil::normSqGrad(isoValue) in
-// nanovdb/math/Stencils.h: six axial WENO5 reconstructions driving
-// GodunovsNormSqrd. Tap indices 0..18 match WenoPt::idx in that
-// file. mInvDx2 and iso are broadcast to FloatV at the final
-// combinator only (free on x86; identity at W=1).
+// normSqGrad -- Godunov's upwind WENO norm-square gradient. Six axial
+// WENO5 reconstructions drive GodunovsNormSqrd; point indices 0..18 follow
+// the StencilPoints tuple ordering above. mInvDx2 and iso are broadcast to
+// ValueType at the final combinator.
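+// Example (x-axis): v[1..6] hold the points at x = -3,-2,-1,+1,+2,+3 and
+// v[0] the center, so dP_xm consumes the five consecutive first differences
+// covering x in [-3,+2] while dP_xp consumes the five covering [-2,+3],
+// outermost difference first.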
// ---------------------------------------------------------------------------
-template <int W>
-__hostdev__ NANOVDB_FORCEINLINE typename WenoStencil<W>::FloatV
-WenoStencil<W>::normSqGrad(float iso) const
+template <typename ValueType>
+__hostdev__ NANOVDB_FORCEINLINE ValueType
+WenoStencil<ValueType>::normSqGrad(float iso) const
 {
-    const FloatV* v = values;
-
-    const FloatV dP_xm = detail::WENO5(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2);
-    const FloatV dP_xp = detail::WENO5(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2);
-    const FloatV dP_ym = detail::WENO5(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2);
-    const FloatV dP_yp = detail::WENO5(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2);
-    const FloatV dP_zm = detail::WENO5(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2);
-    const FloatV dP_zp = detail::WENO5(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2);
-
-    return FloatV(mInvDx2) *
-        detail::GodunovsNormSqrd(v[0] > FloatV(iso),
-                                 dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp);
+    const ValueType* v = values;
+
+    const ValueType dP_xm = detail::WENO5(v[ 2]-v[ 1], v[ 3]-v[ 2], v[ 0]-v[ 3], v[ 4]-v[ 0], v[ 5]-v[ 4], mDx2);
+    const ValueType dP_xp = detail::WENO5(v[ 6]-v[ 5], v[ 5]-v[ 4], v[ 4]-v[ 0], v[ 0]-v[ 3], v[ 3]-v[ 2], mDx2);
+    const ValueType dP_ym = detail::WENO5(v[ 8]-v[ 7], v[ 9]-v[ 8], v[ 0]-v[ 9], v[10]-v[ 0], v[11]-v[10], mDx2);
+    const ValueType dP_yp = detail::WENO5(v[12]-v[11], v[11]-v[10], v[10]-v[ 0], v[ 0]-v[ 9], v[ 9]-v[ 8], mDx2);
+    const ValueType dP_zm = detail::WENO5(v[14]-v[13], v[15]-v[14], v[ 0]-v[15], v[16]-v[ 0], v[17]-v[16], mDx2);
+    const ValueType dP_zp = detail::WENO5(v[18]-v[17], v[17]-v[16], v[16]-v[ 0], v[ 0]-v[15], v[15]-v[14], mDx2);
+
+    return ValueType(mInvDx2) *
+        detail::GodunovsNormSqrd(v[0] > ValueType(iso),
+                                 dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp);
 }
 
 } // namespace nanovdb

From 7cc54ab809970b653cfe7a380e856d3683e52535 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Tue, 28 Apr 2026 11:10:59 -0500
Subject: [PATCH 58/60] WenoStencil: extrapolate convention switch; add
 Sqrt(Simd)

extrapolate() now takes no parameters. Convention: caller pre-loads
inactive lanes of values[k] from sidecar slot 0 (the background); the
method then multiplies by Sign(parent), matching the CUDA stencil
pipeline's missing-tap behaviour exactly (incl. parent==0 -> 0).

Also adds nanovdb::math::Sqrt(Simd) overloads alongside the existing
Min/Max/Select.

ex_weno_nanovdb_cpu drops the absBackground arg; smoke test on
taperLER.vdb stays bit-exact.
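As a scalar sketch of the convention above (illustrative variable names,
not the Simd implementation in WenoStencil.h):

    // Fill side: a missing tap resolves to index 0, so an inactive lane
    // already holds the background value from sidecar slot 0.
    float value  = sidecar[idx];            // idx == 0 when out of band
    bool  active = (idx != 0);

    // extrapolate(): inactive lanes inherit the parent's sign.
    if (!active) {
        const float parent = values[kInner];
        value = parent > 0.f ?  value       // keep +background
              : parent < 0.f ? -value       // flip to -background
              :                 0.f;        // parent == 0 -> 0
    }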
Signed-off-by: Efty Sifakis
---
 .../ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp  |  3 +-
 nanovdb/nanovdb/util/Simd.h                   | 18 ++++++--
 nanovdb/nanovdb/util/WenoStencil.h            | 41 +++++++++++--------
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp
index 8acdd8923f..71b589af72 100644
--- a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp
+++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp
@@ -251,7 +251,6 @@ runFast(const IndexGridT& index
     const uint64_t* jumpMap     = vbmHandle.hostJumpMap();
     const uint64_t  firstOffset = vbmHandle.firstOffset();
 
-    const float absBackground = std::abs(sidecar[0]);
     const float dx = float(indexGrid.voxelSize()[0]);
 
     std::ostringstream sink;
@@ -335,7 +334,7 @@ runFast(const IndexGridT& index
             }
 
             // -------- Phase-3 arithmetic (in-place on Simd values) --------
-            stencil.extrapolate(absBackground);
+            stencil.extrapolate();
             const FloatV result = stencil.normSqGrad(/*iso=*/0.f);
 
             // -------- Simd -> scalar bridge + per-lane store --------
diff --git a/nanovdb/nanovdb/util/Simd.h b/nanovdb/nanovdb/util/Simd.h
index 7d8be384db..363a307fac 100644
--- a/nanovdb/nanovdb/util/Simd.h
+++ b/nanovdb/nanovdb/util/Simd.h
@@ -227,9 +227,9 @@ Simd<T, W> operator/(Simd<T, W> a, T b) { return a / Simd<T, W>(b); }
 } // namespace util
 
 // ---------------------------------------------------------------------------
-// nanovdb::math::Min / Max / Select -- Simd overloads. Scalar overloads
-// live in nanovdb/math/Math.h; defining the SIMD overloads here avoids a
-// Math.h -> Simd.h dependency.
+// nanovdb::math::Min / Max / Select / Sqrt -- Simd overloads. Scalar
+// overloads live in nanovdb/math/Math.h; defining the SIMD overloads here
+// avoids a Math.h -> Simd.h dependency.
 // ---------------------------------------------------------------------------
 namespace math {
 #ifdef NANOVDB_USE_STDX_SIMD
@@ -253,6 +253,11 @@ Select(util::experimental::SimdMask<W> mask,
     util::experimental::stdx::where(mask, result) = a;
     return result;
 }
+template <typename T, int W>
+NANOVDB_FORCEINLINE util::experimental::Simd<T, W>
+Sqrt(util::experimental::Simd<T, W> a) {
+    return std::experimental::sqrt(a);
+}
 #else
 template <typename T, int W>
 __hostdev__ util::experimental::Simd<T, W>
@@ -277,6 +282,13 @@ Select(util::experimental::SimdMask<W> mask,
     for (int i = 0; i < W; i++) r[i] = mask[i] ? a[i] : b[i];
     return r;
 }
+template <typename T, int W>
+__hostdev__ util::experimental::Simd<T, W>
+Sqrt(util::experimental::Simd<T, W> a) {
+    util::experimental::Simd<T, W> r;
+    for (int i = 0; i < W; i++) r[i] = Sqrt(a[i]);
+    return r;
+}
 #endif
 } // namespace math
 
diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h
index 1c50b6ca5d..e3fe994ccb 100644
--- a/nanovdb/nanovdb/util/WenoStencil.h
+++ b/nanovdb/nanovdb/util/WenoStencil.h
@@ -215,18 +215,23 @@ class WenoStencil
     }
 
     // ------------------------------------------------------------------
-    // extrapolate -- repair out-of-band lanes (isActive[k][i] == false) of
-    // values[k] with copysign(absBackground, values[innerPoint][i]).
-    // Active lanes are preserved. Center point (idx 0) is assumed always
-    // in-band and is not processed.
+    // extrapolate -- sign-correct out-of-band lanes (isActive[k][i] == false)
+    // of values[k] by multiplying with Sign(values[innerPoint][i]). Active
+    // lanes are preserved. Center point (idx 0) is assumed always in-band
+    // and is not processed.
+    //
+    // Convention: the caller must pre-load inactive lanes of values[k] with
+    // the sidecar slot-0 background value (which the standard NanoVDB fill
+    // pattern produces automatically: a missing tap resolves to index 0,
+    // and sidecar[0] is the background). This routine then flips the sign
+    // when the parent (innerPoint) is negative, leaves it alone when the
+    // parent is positive, and zeros the lane when the parent is exactly 0.
     //
     // Processes 18 (point, innerPoint) pairs in ascending-|d| order so the
     // inner point is already resolved when the outer point is touched;
     // sign-inheritance through |d|=1 -> |d|=2 -> |d|=3 is automatic.
-    //
-    // Requires absBackground >= 0.
     // ------------------------------------------------------------------
-    __hostdev__ NANOVDB_FORCEINLINE void extrapolate(float absBackground);
+    __hostdev__ NANOVDB_FORCEINLINE void extrapolate();
 
     // ------------------------------------------------------------------
     // normSqGrad -- Godunov's norm-square of the fifth-order WENO upwind
@@ -289,21 +294,25 @@ class WenoStencil
 // ---------------------------------------------------------------------------
 template <typename ValueType>
 __hostdev__ NANOVDB_FORCEINLINE void
-WenoStencil<ValueType>::extrapolate(float absBackground)
+WenoStencil<ValueType>::extrapolate()
 {
-    const ValueType absBg(absBackground);
-    const ValueType zero (0.f);
+    const ValueType zero(0.f);
 
     for (int p = 0; p < kNumPairs; ++p) {
        const int k      = kPairs[p][0];
        const int kInner = kPairs[p][1];
 
-        // copysign(absBg, inner): +absBg if inner >= 0, else -absBg.
-        const MaskType  isNegInner = zero > values[kInner];
-        const ValueType extrap     = math::Select(isNegInner, -absBg, absBg);
-
-        // Active lanes keep their own value; inactive lanes take the extrapolated sign-corrected background.
-        values[k] = math::Select(isActive[k], values[k], extrap);
+        // values[k] *= Sign(values[kInner]):
+        //   parent  > 0 -> values[k] (already pre-loaded with +background);
+        //   parent  < 0 -> -values[k];
+        //   parent == 0 -> 0.
+        const MaskType isPosParent = values[kInner] > zero;
+        const MaskType isNegParent = values[kInner] < zero;
+        const ValueType signed_k = math::Select(isPosParent, values[k],
+                                   math::Select(isNegParent, -values[k], zero));
+
+        // Active lanes keep their own value; inactive lanes get the sign-corrected background.
+        values[k] = math::Select(isActive[k], values[k], signed_k);
     }
 }

From 158e3df537bd9d43e86422159abf008b9f5f8af6 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Tue, 28 Apr 2026 11:36:43 -0500
Subject: [PATCH 59/60] WenoStencil: absorb gather as static gatherIndices();
 drop LegacyStencilAccessor

WenoStencil::gatherIndices(acc, center, out) is a parameter-
pack-expanded helper that resolves all SIZE point indices for the
voxel at @a center via Acc::getValue(center + offset). Same semantics
as the old LegacyStencilAccessor's moveTo+operator[] pair, but
co-located with the layout it operates on (StencilPoints) and free of
class state.

Drop nanovdb/util/LegacyStencilAccessor.h: the wrapper class was a
thin syntactic layer over a raw ReadAccessor and a fixed offset list,
with no caching beyond what ReadAccessor already provides. Without a
SIMD-batch sibling to justify the abstraction tax, dissolving it to a
free static helper keeps StencilPoints as the single source of truth
for the stencil layout and lets each consumer own its own
ReadAccessor + uint64_t indices[SIZE] scratch.

ex_weno_nanovdb_cpu migrated to the new helper; smoke test on
taperLER.vdb still agrees with the scalar reference
(max |Delta| = 9.53674e-07).
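A minimal sketch of the gather shape described above (assuming the
StencilPoints tuple and the C++20 expansion style used elsewhere in this
series; the authoritative signature is the one in WenoStencil.h):

    template <typename Acc>
    static void gatherIndices(const Acc& acc, const Coord& center,
                              uint64_t out[SIZE])
    {
        // One getValue per stencil point, at that point's compile-time
        // offset (di, dj, dk); expansion is over the StencilPoints tuple.
        [&]<size_t... Is>(std::index_sequence<Is...>) {
            ((out[Is] = acc.getValue(center + Coord(
                  std::tuple_element_t<Is, StencilPoints>::di,
                  std::tuple_element_t<Is, StencilPoints>::dj,
                  std::tuple_element_t<Is, StencilPoints>::dk))), ...);
        }(std::make_index_sequence<SIZE>{});
    }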
Signed-off-by: Efty Sifakis --- nanovdb/nanovdb/NanoVDB.h | 2 +- nanovdb/nanovdb/examples/CMakeLists.txt | 1 - .../narrowband_stencil_cpu.cpp | 1256 ----------- .../stencil_gather_cpu.cpp | 627 ------ .../StencilGather.md | 727 ------- .../ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp | 32 +- nanovdb/nanovdb/util/BatchAccessor.md | 1885 ----------------- nanovdb/nanovdb/util/LegacyStencilAccessor.h | 127 -- nanovdb/nanovdb/util/StencilAccessor.md | 704 ------ nanovdb/nanovdb/util/WenoStencil.h | 31 +- nanovdb/nanovdb/util/WenoStencil.md | 36 +- 11 files changed, 43 insertions(+), 5385 deletions(-) delete mode 100644 nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp delete mode 100644 nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp delete mode 100644 nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md delete mode 100644 nanovdb/nanovdb/util/BatchAccessor.md delete mode 100644 nanovdb/nanovdb/util/LegacyStencilAccessor.h delete mode 100644 nanovdb/nanovdb/util/StencilAccessor.md diff --git a/nanovdb/nanovdb/NanoVDB.h b/nanovdb/nanovdb/NanoVDB.h index dc6deb6065..8ed16899c4 100644 --- a/nanovdb/nanovdb/NanoVDB.h +++ b/nanovdb/nanovdb/NanoVDB.h @@ -4137,7 +4137,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafDatahasStats() ? this->lastOffset() + 3u : 0u; } __hostdev__ uint64_t getDev() const { return this->hasStats() ? this->lastOffset() + 4u : 0u; } // Default branchless; define NANOVDB_USE_BRANCHY_GETVALUE to restore the - // pre-2026 branchy form. See BatchAccessor.md §8k for rationale. + // pre-2026 branchy form. __hostdev__ uint64_t getValue(uint32_t i) const { #ifdef NANOVDB_USE_BRANCHY_GETVALUE diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt index 6f84a514d0..1c337b7f48 100644 --- a/nanovdb/nanovdb/examples/CMakeLists.txt +++ b/nanovdb/nanovdb/examples/CMakeLists.txt @@ -123,7 +123,6 @@ endif() # End-to-end CPU WENO5 norm-square-gradient on a narrow-band level set, # with a scalar reference for correctness validation. -# (See BatchAccessor.md §11 for the full Phase-2+3 pipeline this demonstrates.) nanovdb_example(NAME "ex_weno_nanovdb_cpu" OPENVDB) if(TARGET ex_weno_nanovdb_cpu) target_compile_options(ex_weno_nanovdb_cpu PRIVATE -march=native -fopenmp-simd) diff --git a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp b/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp deleted file mode 100644 index 96ffe4acd6..0000000000 --- a/nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/narrowband_stencil_cpu.cpp +++ /dev/null @@ -1,1256 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 - -/*! - \file narrowband_stencil_cpu.cpp - - \brief CPU stencil gather on a real narrow-band level set loaded from .vdb. - - Counterpart to ex_stencil_gather_cpu, which uses a procedurally generated - random-occupancy domain. This example instead loads an openvdb level-set - FloatGrid from disk, converts it to a NanoVDB ValueOnIndex topology grid, - and harvests the source float values into a separately-allocated sidecar - buffer. Purpose: exercise the same perf-decomposition battery on a - workload with realistic spatial coherence — narrow-band taps are mostly - close to the surface, so the valueMask.isOn(offset) branch may be more - predictable than in the random-occupancy case (see BatchAccessor.md §8j). 
- - Pipeline: - openvdb::io::File(path) -- disk load - -> openvdb::GridBase::Ptr -- untyped handle - -> openvdb::FloatGrid -- typed, narrow-band - -> nanovdb::tools::CreateNanoGrid -- builder - .getHandle(channels=0) -- topology only - .copyValues(sidecar.data()) -- float sidecar - -> VBM + runPrototype + runPerf (identical to ex_stencil_gather_cpu) - - The sidecar is captured but not yet consumed by any stencil path -- plumbing - only, for future "fetch values via the sidecar" work. A one-time - validation check at startup compares FloatGrid.getValue(ijk) against - sidecar[indexGrid.tree().getValue(ijk)] on ~1000 random active voxels. - - Build: - Configured via CMakeLists.txt in the parent examples/ directory. - Requires OpenVDB (for .vdb IO). No CUDA. - - Usage: - narrowband_stencil_cpu [--grid=] - [--pass=] [--threads=] - [--skip-validation] -*/ - -#include -#include -#include // CreateNanoGrid builder, openToIndexVDB -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include // std::abs (sidecar-stencil-extrap) -#include -#include // std::memcpy (sidecar-pass checksum) -#include -#include // std::unique_ptr -#include -#include // std::accumulate (checksum) -#include -#include - -// ============================================================ -// Constants and type aliases -// ============================================================ - -static constexpr int Log2BlockWidth = 7; -static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128 -static constexpr int SIMDw = 16; // StencilAccessor batch width - -using BuildT = nanovdb::ValueOnIndex; -using GridT = nanovdb::NanoGrid; -using LeafT = nanovdb::NanoLeaf; -using CPUVBM = nanovdb::tools::VoxelBlockManager; - -using SAccT = nanovdb::StencilAccessor; -using LegacyAccT = nanovdb::LegacyStencilAccessor; - -// ============================================================ -// VDB file loading + sidecar harvest -// ============================================================ - -/// Picks the first openvdb::FloatGrid from the file (optionally by name). -/// Throws on any failure (file not found, no FloatGrid, etc.). -static openvdb::FloatGrid::Ptr -loadFloatGridFromVdb(const std::string& path, const std::string& gridName) -{ - openvdb::io::File file(path); - file.open(false); // delayed loading off - - openvdb::GridBase::Ptr base; - if (!gridName.empty()) { - if (!file.hasGrid(gridName)) - throw std::runtime_error( - "no grid named \"" + gridName + "\" in " + path); - base = file.readGrid(gridName); - } else { - // First FloatGrid wins. - openvdb::GridPtrVecPtr grids = file.getGrids(); - for (auto& g : *grids) { - if (g && g->isType()) { - base = g; // already fully loaded by getGrids() - break; - } - } - if (!base) - throw std::runtime_error("no openvdb::FloatGrid found in " + path); - } - file.close(); - - auto floatGrid = openvdb::gridPtrCast(base); - if (!floatGrid) - throw std::runtime_error("grid is not an openvdb::FloatGrid"); - return floatGrid; -} - -/// Convert an openvdb::FloatGrid into a NanoVDB ValueOnIndex topology grid -/// plus a separately-allocated std::vector sidecar, using the -/// CreateNanoGrid builder path (channels=0, no blind data in grid). -/// -/// The builder's internal mValIdx is populated by getHandle(), so the -/// subsequent copyValues() writes the FloatGrid's active voxel values into -/// the sidecar in the same order that leaf.getValue(offset) returns. 
-struct NarrowBandPayload { - nanovdb::GridHandle handle; - std::vector sidecar; -}; - -static NarrowBandPayload -convertToIndexGridWithSidecar(openvdb::FloatGrid& floatGrid) -{ - nanovdb::tools::CreateNanoGrid builder(floatGrid); - - NarrowBandPayload p; - p.handle = builder.template getHandle< - nanovdb::ValueOnIndex, nanovdb::HostBuffer>( - /*channels =*/ 0u, // no blind data - /*incStats =*/ false, - /*incTiles =*/ false); - - // valueCount() is only valid after getHandle with an index DstBuildT. - p.sidecar.resize(builder.valueCount()); - builder.template copyValues(p.sidecar.data()); - - // NanoVDB convention: index 0 of the sidecar holds the background value. - // copyValues does not write slot 0 (active voxel indices start at 1); - // set it explicitly so downstream code can treat sidecar[idx] as valid - // for both in-band (idx>0) and out-of-band (idx==0) taps without branching. - if (!p.sidecar.empty()) p.sidecar[0] = floatGrid.background(); - return p; -} - -/// One-time consistency check between the source FloatGrid and the -/// IndexGrid + sidecar pair. Samples N active voxels from the source, -/// verifies: floatGrid.getValue(ijk) == sidecar[indexGrid.tree().getValue(ijk)]. -/// Returns number of mismatches (0 == pass). -static uint64_t -validateSidecarOrdering(const openvdb::FloatGrid& floatGrid, - const nanovdb::NanoGrid& indexGrid, - const std::vector& sidecar, - size_t maxSamples = 1000) -{ - // Walk the source grid's active voxels; sample up to maxSamples of them. - const auto totalActive = floatGrid.activeVoxelCount(); - if (totalActive == 0) return 0; - - const size_t step = std::max(1, size_t(totalActive / maxSamples)); - auto indexAcc = indexGrid.getAccessor(); - - uint64_t checked = 0, mismatches = 0, firstReports = 0; - size_t strideCounter = 0; - - for (auto it = floatGrid.cbeginValueOn(); it; ++it) { - if ((strideCounter++ % step) != 0) continue; - - const openvdb::Coord& oc = it.getCoord(); - const nanovdb::Coord nc(oc.x(), oc.y(), oc.z()); - - const uint64_t idx = indexAcc.getValue(nc); - if (idx == 0 || idx >= sidecar.size()) { - ++mismatches; - if (firstReports++ < 5) - std::cerr << " sidecar OOB at (" << oc.x() << "," << oc.y() - << "," << oc.z() << "): idx=" << idx - << " sidecar.size=" << sidecar.size() << "\n"; - continue; - } - const float expected = it.getValue(); - const float actual = sidecar[idx]; - if (expected != actual) { - ++mismatches; - if (firstReports++ < 5) - std::cerr << " sidecar MISMATCH at (" << oc.x() << "," << oc.y() - << "," << oc.z() << "): idx=" << idx - << " expected=" << expected - << " actual=" << actual << "\n"; - } - ++checked; - if (checked >= maxSamples) break; - } - - std::cout << "Sidecar validation: checked=" << checked - << " mismatches=" << mismatches - << (mismatches == 0 ? " PASSED\n" : " FAILED\n"); - return mismatches; -} - -// ============================================================ -// Verification -// ============================================================ - -struct VerifyStats { - uint64_t laneChecks = 0; - uint64_t errors = 0; -}; - -/// Cross-validate one StencilAccessor batch against LegacyStencilAccessor. -/// -/// Active lanes (leafIndex[p] != UnusedLeafIndex): reconstruct the global -/// coordinate from (leafIndex, voxelOffset), call legacyAcc.moveTo(), and -/// compare all SIZE tap indices element-by-element. -/// -/// Inactive lanes: assert all tap slots in stencilAcc hold 0 (background index). 
-static void verifyStencilAccessor( - const SAccT& stencilAcc, - const uint32_t* leafIndex, - const uint16_t* voxelOffset, - int batchStart, - const LeafT* firstLeaf, - LegacyAccT& legacyAcc, - VerifyStats& stats) -{ - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - const uint32_t li = leafIndex[p]; - - if (li == CPUVBM::UnusedLeafIndex) { - // Inactive lane: all tap slots must hold 0 (NanoVDB background index). - for (int k = 0; k < stencilAcc.size(); ++k) { - ++stats.laneChecks; - const uint64_t got = stencilAcc.mIndices[k][i]; - if (got != 0) { - ++stats.errors; - if (stats.errors <= 10) - std::cerr << "STENCIL inactive lane=" << i - << " tap=" << k - << ": expected 0, got " << got << "\n"; - } - } - continue; - } - - // Active lane: compare against the LegacyStencilAccessor oracle. - const uint16_t vo = voxelOffset[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - - legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); - - for (int k = 0; k < stencilAcc.size(); ++k) { - ++stats.laneChecks; - const uint64_t expected = legacyAcc[k]; - const uint64_t actual = stencilAcc.mIndices[k][i]; - if (actual != expected) { - ++stats.errors; - if (stats.errors <= 10) - std::cerr << "STENCIL MISMATCH" - << " tap=" << k - << " lane=" << i - << " expected=" << expected - << " actual=" << actual << "\n"; - } - } - } -} - -// ============================================================ -// Correctness run: cross-validate StencilAccessor vs LegacyStencilAccessor -// ============================================================ - -static void runPrototype( - const GridT* grid, - const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) -{ - const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); - const uint64_t nVoxels = grid->activeVoxelCount(); - const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); - - const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); - const uint64_t* jumpMap = vbmHandle.hostJumpMap(); - - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - // LegacyStencilAccessor owns its ReadAccessor; one instance per thread. 
- LegacyAccT legacyAcc(*grid); - VerifyStats stats; - - for (uint32_t bID = 0; bID < nBlocks; ++bID) { - const uint64_t blockFirstOffset = - vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth; - - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength], - blockFirstOffset, leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - verifyStencilAccessor(stencilAcc, - leafIndex, voxelOffset, batchStart, - firstLeaf, legacyAcc, stats); - } - } - - std::cout << "Correctness (StencilAccessor vs LegacyStencilAccessor):\n" - << " blocks = " << nBlocks << "\n" - << " voxels = " << nVoxels << "\n" - << " laneChecks = " << stats.laneChecks << "\n"; - - if (stats.errors == 0) - std::cout << " PASSED\n"; - else - std::cerr << " FAILED: " << stats.errors << " mismatches\n"; -} - -// ============================================================ -// End-to-end performance comparison (multithreaded) -// -// Both paths run the full pipeline inside util::forEach: -// decodeInverseMaps → coord extraction → stencil gather → sum → store -// -// decodeInverseMaps is deliberately included: its cost is identical for -// both paths (pure cancellation in the comparison) and including it avoids -// fine-grained intra-block timing artifacts. -// -// Anti-DCE artifact: for each active voxel, accumulate the sum of all 18 -// tap uint64_t indices and write to sums[bID * BlockWidth + i]. The final -// XOR checksum is printed, forcing the compiler to materialise the stores. -// -// Timing: nanovdb::util::Timer (steady_clock) around each forEach. -// warm pass discards its measurement; only the second pass is reported. -// -// Denominator: grid->activeVoxelCount() — same for both paths. -// ============================================================ - -static void runPerf( - const GridT* grid, - const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle, - const std::vector& sidecar, - const std::string& passFilter = "all") -{ - // wantPass() returns true if this pass should run under the current filter. - // Supported names: "decode", "stencil", "framing", "legacy", - // "sidecar-legacy", "sidecar-stencil", "sidecar-stencil-extrap", - // "sidecar-stencil-normsqgrad", "sidecar-transposed", - // "legacy-transposed". - // "all" runs everything. - auto wantPass = [&](const char* name) { - return passFilter == "all" || passFilter == name; - }; - - const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); - const uint64_t nVoxels = grid->activeVoxelCount(); - const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); - const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); - const uint64_t* jumpMap = vbmHandle.hostJumpMap(); - const uint64_t firstOffset = vbmHandle.firstOffset(); - - // Anti-DCE output array. Each thread writes its own non-overlapping - // range (bID * BlockWidth ... + BlockWidth - 1) — no synchronisation needed. - std::vector sums((size_t)nBlocks * BlockWidth, 0); - - // Second sidecar for the `sidecar` pass: written at each voxel's - // VBM-sequential index (firstOffset + bID*BlockWidth + lane), which by - // construction equals the center voxel's ValueOnIndex. 
Sized to match - // the input sidecar so we can reuse its indexing. - std::vector outputSidecar(sidecar.size(), 0.f); - - std::ostringstream sink; // absorbs Timer's warm-pass "... " output - nanovdb::util::Timer timer; - - auto timeForEach = [&](auto&& body) -> double { - // warm pass - timer.start("", sink); - body(); - timer.elapsed(); - // timed pass - timer.start("", sink); - body(); - return static_cast(timer.elapsed()); - }; - - // ---- decodeInverseMaps-only baseline (both paths pay this cost) ---- - // Anti-DCE: XOR one uint64_t per block derived from leafIndex[] + voxelOffset[] - // so the compiler can't elide the decode work. - double decodeUs = 0.0; - if (wantPass("decode")) decodeUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t acc = 0; - for (int i = 0; i < BlockWidth; ++i) - acc ^= (uint64_t(leafIndex[i]) << 16) | uint64_t(voxelOffset[i]); - sums[bID * BlockWidth] = acc; // one slot per block as anti-DCE - } - }); - }); - - // ---- StencilAccessor ---- - double stencilUs = 0.0; - uint64_t stencilChecksum = 0; - if (wantPass("stencil")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - - stencilUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - uint64_t* bs = sums.data() + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; - uint64_t s = 0; - for (int k = 0; k < SAccT::size(); ++k) - s += stencilAcc.mIndices[k][i]; - bs[batchStart + i] = s; - } - } - } - }); - }); - - stencilChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("stencil") - - // ---- Legacy framing floor: loop structure + decode, no accessor call ---- - // Anti-DCE writes derive from Coord components. Subtracted from the legacy - // pass to expose the 19-tap cost proper. 
- double framingUs = 0.0; - if (wantPass("framing")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - framingUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - uint64_t* bs0 = sums.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t* bs = bs0 + bID * BlockWidth; - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[p]; - const uint32_t li = leafIndex[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); - // 18 trivial "taps" — no accessor call; anti-DCE via Coord components. - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) - s += static_cast(center.x() + center.y() + center.z() + k); - bs[p] = s; - } - } - } - }); - }); - } // end wantPass("framing") - - // ---- LegacyStencilAccessor ---- - double legacyUs = 0.0; - uint64_t legacyChecksum = 0; - if (wantPass("legacy")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - - legacyUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - LegacyAccT legacyAcc(*grid); // one ReadAccessor per task - uint64_t* bs0 = sums.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t* bs = bs0 + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[p]; - const uint32_t li = leafIndex[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); - uint64_t s = 0; - for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k]; - bs[p] = s; - } - } - } - }); - }); - - legacyChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("legacy") - - // ---- sidecar-legacy: float value + bool isActive matrices via LegacyStencilAccessor ---- - // Precursor to the full WENO5 pipeline (§11 of BatchAccessor.md). Within - // each SIMDw-lane batch, assembles two per-tap arrays: - // float values[SIZE][SIMDw] -- sidecar[idx] (idx==0 -> background) - // bool isActive[SIZE][SIMDw] -- (idx != 0) - // Token op (anti-DCE, stand-in for WENO arithmetic): per active voxel, - // sum values[k][i] over taps with isActive[k][i]==true, write the result - // to outputSidecar at the voxel's VBM-sequential index. 
- double sidecarLegacyUs = 0.0; - uint64_t sidecarLegacyChecksum = 0; - if (wantPass("sidecar-legacy")) { - std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); - - sidecarLegacyUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - LegacyAccT legacyAcc(*grid); - - constexpr int SIZE = LegacyAccT::size(); - alignas(64) float values [SIZE][SIMDw]; - alignas(64) bool isActive[SIZE][SIMDw]; - - const float* const scIn = sidecar.data(); - float* const scOut = outputSidecar.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - const uint64_t blockBase = - firstOffset + (uint64_t)bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - // Fill values[][] and isActive[][] for this batch. - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) { - for (int k = 0; k < SIZE; ++k) { - values[k][i] = scIn[0]; - isActive[k][i] = false; - } - continue; - } - const uint16_t vo = voxelOffset[p]; - const uint32_t li = leafIndex[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); - for (int k = 0; k < SIZE; ++k) { - const uint64_t idx = legacyAcc[k]; - values[k][i] = scIn[idx]; // scIn[0] == background - isActive[k][i] = (idx != 0); - } - } - - // Token op: sum values for Active taps per voxel. - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - float sum = 0.f; - for (int k = 0; k < SIZE; ++k) - if (isActive[k][i]) sum += values[k][i]; - scOut[blockBase + p] = sum; - } - } - } - }); - }); - - // Anti-DCE checksum: XOR of the float bit patterns across the full - // output sidecar. Zero-initialised slots contribute 0 (XOR identity), - // so inactive voxels don't disturb the result. - sidecarLegacyChecksum = - std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), - [](uint64_t a, float b) { - uint32_t bits; - std::memcpy(&bits, &b, sizeof(bits)); - return a ^ uint64_t(bits); - }); - } // end wantPass("sidecar-legacy") - - // ---- sidecar-stencil: same matrices via StencilAccessor (hybrid SIMD+scalar) ---- - // Uses StencilAccessor's mIndices[SIZE][SIMDw] — the result of its SIMD - // direction-decode + scalar leaf.getValue() tail — directly as the - // uint64 index source for the sidecar lookup. Inactive lanes have - // mIndices[k][i]=0 naturally (StencilAccessor zero-fills), so the fill - // loop has no per-lane UnusedLeafIndex guard. 
- double sidecarStencilUs = 0.0; - uint64_t sidecarStencilChecksum = 0; - if (wantPass("sidecar-stencil")) { - std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); - - sidecarStencilUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - constexpr int SIZE = SAccT::size(); - alignas(64) float values [SIZE][SIMDw]; - alignas(64) bool isActive[SIZE][SIMDw]; - - const float* const scIn = sidecar.data(); - float* const scOut = outputSidecar.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - const uint64_t blockBase = - firstOffset + (uint64_t)bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - - // Tap-outer fill: StencilAccessor stores mIndices[tap][lane] - // contiguously along the lane axis, so iterating k-outer - // turns lane-inner into a 16-wide sweep over one row. - for (int k = 0; k < SIZE; ++k) { - for (int i = 0; i < SIMDw; ++i) { - const uint64_t idx = stencilAcc.mIndices[k][i]; - values[k][i] = scIn[idx]; // scIn[0] == background - isActive[k][i] = (idx != 0); - } - } - - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - float sum = 0.f; - for (int k = 0; k < SIZE; ++k) - if (isActive[k][i]) sum += values[k][i]; - scOut[blockBase + p] = sum; - } - } - } - }); - }); - - sidecarStencilChecksum = - std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), - [](uint64_t a, float b) { - uint32_t bits; - std::memcpy(&bits, &b, sizeof(bits)); - return a ^ uint64_t(bits); - }); - } // end wantPass("sidecar-stencil") - - // ---- sidecar-stencil-extrap: sidecar-stencil + WenoStencil::extrapolate ---- - // Same fill as sidecar-stencil, then calls WenoStencil::extrapolate - // to repair out-of-band lanes via copysign(|background|, mValues[innerTap]). - // After extrapolation, isActive is not needed for the downstream op; - // the token sum over ALL taps (active + extrapolated) is the anti-DCE - // artifact. Checksum will differ from sidecar-stencil (which summed - // active-only) — that's the expected correctness signal. - double sidecarStencilExtrapUs = 0.0; - uint64_t sidecarStencilExtrapChecksum = 0; - if (wantPass("sidecar-stencil-extrap")) { - std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); - - const float absBackground = std::abs(sidecar[0]); // sidecar[0] = floatGrid.background() - - sidecarStencilExtrapUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - // Caller-owned fill-side scratch — scalar scatter writes from - // the sidecar land here, then a per-tap SIMD load moves the - // data into the stencil's Simd-typed compute view. 
- alignas(64) float raw_values[nanovdb::WenoStencil::size()][SIMDw]; - alignas(64) bool raw_active[nanovdb::WenoStencil::size()][SIMDw]; - - nanovdb::WenoStencil stencil; - constexpr int SIZE = nanovdb::WenoStencil::size(); - using FloatV = nanovdb::util::Simd ; - using MaskV = nanovdb::util::SimdMask; - - const float* const scIn = sidecar.data(); - float* const scOut = outputSidecar.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - const uint64_t blockBase = - firstOffset + (uint64_t)bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - - // Scalar scatter fill into caller-owned C arrays. - for (int k = 0; k < SIZE; ++k) { - for (int i = 0; i < SIMDw; ++i) { - const uint64_t idx = stencilAcc.mIndices[k][i]; - raw_values[k][i] = scIn[idx]; - raw_active[k][i] = (idx != 0); - } - } - - // SIMD load-per-tap into the stencil's compute view. - for (int k = 0; k < SIZE; ++k) { - stencil.values [k] = FloatV(raw_values[k], nanovdb::util::element_aligned); - stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::element_aligned); - } - - // Arithmetic — reads/writes stencil.values[] as Simd in place. - stencil.extrapolate(absBackground); - - // Token sum over all 19 taps, entirely in Simd form. - FloatV sum(0.f); - for (int k = 0; k < SIZE; ++k) sum = sum + stencil.values[k]; - - // Simd → scalar bridge at the output side, mirroring the - // fill-side bridge: SIMD store into a scratch, then per-lane - // scalar write to the output sidecar (gated by leafIndex). - alignas(64) float sum_lanes[SIMDw]; - nanovdb::util::store(sum, sum_lanes, nanovdb::util::element_aligned); - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - scOut[blockBase + p] = sum_lanes[i]; - } - } - } - }); - }); - - sidecarStencilExtrapChecksum = - std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), - [](uint64_t a, float b) { - uint32_t bits; - std::memcpy(&bits, &b, sizeof(bits)); - return a ^ uint64_t(bits); - }); - } // end wantPass("sidecar-stencil-extrap") - - // ---- sidecar-stencil-normsqgrad: full Phase-2+3 pipeline ---- - // load → extrapolate → normSqGrad → store. Same Phase-2 front end as - // sidecar-stencil-extrap, but the 19-tap token sum is replaced by the - // real Phase-3 arithmetic: Godunov's fifth-order WENO upwind - // norm-square gradient. The per-voxel `|∇φ|²` goes straight into the - // output sidecar — no debug intermediate. - // - // Grid voxel size from grid->voxelSize()[0] (isotropic assumption for - // narrow-band SDFs). iso = 0 (zero-crossing is the surface). 
- double sidecarStencilNormSqGradUs = 0.0; - uint64_t sidecarStencilNormSqGradChecksum = 0; - if (wantPass("sidecar-stencil-normsqgrad")) { - std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); - - const float absBackground = std::abs(sidecar[0]); - const float dx = float(grid->voxelSize()[0]); - - sidecarStencilNormSqGradUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - alignas(64) float raw_values[nanovdb::WenoStencil::size()][SIMDw]; - alignas(64) bool raw_active[nanovdb::WenoStencil::size()][SIMDw]; - - nanovdb::WenoStencil stencil(dx); - constexpr int SIZE = nanovdb::WenoStencil::size(); - using FloatV = nanovdb::util::Simd ; - using MaskV = nanovdb::util::SimdMask; - - const float* const scIn = sidecar.data(); - float* const scOut = outputSidecar.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - const uint64_t blockBase = - firstOffset + (uint64_t)bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - - // Fill — scalar scatter from sidecar into caller-owned raw C arrays. - for (int k = 0; k < SIZE; ++k) { - for (int i = 0; i < SIMDw; ++i) { - const uint64_t idx = stencilAcc.mIndices[k][i]; - raw_values[k][i] = scIn[idx]; - raw_active[k][i] = (idx != 0); - } - } - - // Load — per-tap SIMD load into stencil's compute view. - for (int k = 0; k < SIZE; ++k) { - stencil.values [k] = FloatV(raw_values[k], nanovdb::util::element_aligned); - stencil.isActive[k] = MaskV (raw_active[k], nanovdb::util::element_aligned); - } - - // Phase-3 arithmetic (in-place on stencil.values[], then reduce). - stencil.extrapolate(absBackground); - const FloatV result = stencil.normSqGrad(/* iso = */ 0.f); - - // Simd → scalar bridge; per-lane scalar write to output sidecar. - alignas(64) float result_lanes[SIMDw]; - nanovdb::util::store(result, result_lanes, nanovdb::util::element_aligned); - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - scOut[blockBase + p] = result_lanes[i]; - } - } - } - }); - }); - - sidecarStencilNormSqGradChecksum = - std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), - [](uint64_t a, float b) { - uint32_t bits; - std::memcpy(&bits, &b, sizeof(bits)); - return a ^ uint64_t(bits); - }); - } // end wantPass("sidecar-stencil-normsqgrad") - - // ---- sidecar-transposed: tap-outer fill via direct ReadAccessor ---- - // Mirrors `legacy-transposed`'s loop structure, but instead of summing - // uint64 indices into a per-voxel accumulator, the tap-outer loop fills - // values[tap][lane] + isActive[tap][lane]. A second voxel-outer pass - // performs the same token sum as the other variants. 
- double sidecarXposedUs = 0.0; - uint64_t sidecarXposedChecksum = 0; - if (wantPass("sidecar-transposed")) { - std::fill(outputSidecar.begin(), outputSidecar.end(), 0.f); - - using Weno5TapsX = nanovdb::Weno5Stencil::Taps; - static constexpr int SIZEX = int(std::tuple_size_v); - - sidecarXposedUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - alignas(64) nanovdb::Coord centers[SIMDw]; - alignas(64) float values [SIZEX][SIMDw]; - alignas(64) bool isActive[SIZEX][SIMDw]; - nanovdb::ReadAccessor acc(grid->tree().root()); - - const float* const scIn = sidecar.data(); - float* const scOut = outputSidecar.data(); - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - const uint64_t blockBase = - firstOffset + (uint64_t)bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - const uint16_t vo = voxelOffset[p]; - const uint32_t li = leafIndex[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - centers[i] = cOrigin + nanovdb::Coord( - (vo >> 6) & 7, (vo >> 3) & 7, vo & 7); - } - - auto processTap = [&]() - [[gnu::always_inline]] - { - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) { - values [K][i] = scIn[0]; - isActive[K][i] = false; - continue; - } - const nanovdb::Coord c = centers[i] - + nanovdb::Coord(DI, DJ, DK); - const LeafT* leaf = acc.probeLeaf(c); - if (!leaf) { - values [K][i] = scIn[0]; - isActive[K][i] = false; - continue; - } - const uint32_t offset = (uint32_t(c[0] & 7) << 6) - | (uint32_t(c[1] & 7) << 3) - | uint32_t(c[2] & 7); - const uint64_t idx = leaf->data()->getValue(offset); - values [K][i] = scIn[idx]; - isActive[K][i] = (idx != 0); - } - }; - - [&](std::index_sequence) { - (processTap.template operator()< - int(Is), - std::tuple_element_t::di, - std::tuple_element_t::dj, - std::tuple_element_t::dk>(), ...); - }(std::make_index_sequence{}); - - for (int i = 0; i < SIMDw; ++i) { - const int p = batchStart + i; - if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue; - float sum = 0.f; - for (int k = 0; k < SIZEX; ++k) - if (isActive[k][i]) sum += values[k][i]; - scOut[blockBase + p] = sum; - } - } - } - }); - }); - - sidecarXposedChecksum = - std::accumulate(outputSidecar.begin(), outputSidecar.end(), uint64_t(0), - [](uint64_t a, float b) { - uint32_t bits; - std::memcpy(&bits, &b, sizeof(bits)); - return a ^ uint64_t(bits); - }); - } // end wantPass("sidecar-transposed") - - // ---- Legacy transposed: tap-outer, voxel-inner ---- - // Same semantics as `legacy`, reordered. For each of the 19 WENO5 taps, - // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue - // calls with the SAME compile-time tap offset but varying center voxels. 
- double legacyXposedUs = 0.0;
- uint64_t legacyXposedChecksum = 0;
- if (wantPass("legacy-transposed")) {
- std::fill(sums.begin(), sums.end(), uint64_t(0));
-
- using Weno5Taps = nanovdb::Weno5Stencil::Taps;
- static constexpr int SIZE = int(std::tuple_size_v<Weno5Taps>);
-
- legacyXposedUs = timeForEach([&] {
- nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
- [&](const nanovdb::util::Range1D& range) {
- alignas(64) uint32_t leafIndex[BlockWidth];
- alignas(64) uint16_t voxelOffset[BlockWidth];
- alignas(64) nanovdb::Coord centers[SIMDw];
- alignas(64) uint64_t s[SIMDw];
- nanovdb::ReadAccessor<nanovdb::ValueOnIndex> acc(grid->tree().root());
- uint64_t* bs0 = sums.data();
-
- for (size_t bID = range.begin(); bID != range.end(); ++bID) {
- CPUVBM::decodeInverseMaps(
- grid, firstLeafID[bID],
- &jumpMap[bID * CPUVBM::JumpMapLength],
- firstOffset + bID * BlockWidth,
- leafIndex, voxelOffset);
-
- uint64_t* bs = bs0 + bID * BlockWidth;
-
- for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
- for (int i = 0; i < SIMDw; ++i) {
- s[i] = 0;
- const int p = batchStart + i;
- if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
- const uint16_t vo = voxelOffset[p];
- const uint32_t li = leafIndex[p];
- const nanovdb::Coord cOrigin = firstLeaf[li].origin();
- centers[i] = cOrigin + nanovdb::Coord(
- (vo >> 6) & 7, (vo >> 3) & 7, vo & 7);
- }
-
- auto processTap = [&]<int DI, int DJ, int DK>()
- [[gnu::always_inline]]
- {
- for (int i = 0; i < SIMDw; ++i) {
- if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue;
- const nanovdb::Coord c = centers[i]
- + nanovdb::Coord(DI, DJ, DK);
- const LeafT* leaf = acc.probeLeaf(c);
- if (!leaf) continue;
- const uint32_t offset = (uint32_t(c[0] & 7) << 6)
- | (uint32_t(c[1] & 7) << 3)
- | uint32_t(c[2] & 7);
- s[i] += leaf->data()->getValue(offset);
- }
- };
-
- [&]<std::size_t... Is>(std::index_sequence<Is...>) {
- (processTap.template operator()<
- std::tuple_element_t<Is, Weno5Taps>::di,
- std::tuple_element_t<Is, Weno5Taps>::dj,
- std::tuple_element_t<Is, Weno5Taps>::dk>(), ...);
- }(std::make_index_sequence<SIZE>{});
-
- for (int i = 0; i < SIMDw; ++i) bs[batchStart + i] = s[i];
- }
- }
- });
- });
-
- legacyXposedChecksum =
- std::accumulate(sums.begin(), sums.end(), uint64_t(0),
- [](uint64_t a, uint64_t b) { return a ^ b; });
- } // end wantPass("legacy-transposed")
-
- std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n",
- nBlocks, nVoxels);
- std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n",
- decodeUs / 1e3, decodeUs * 1e3 / double(nVoxels));
- std::printf(" Framing (no accessor) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode]\n",
- framingUs / 1e3, framingUs * 1e3 / double(nVoxels),
- (framingUs - decodeUs) / 1e3);
- std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels),
- (stencilUs - decodeUs) / 1e3, stencilChecksum);
- std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels),
- (legacyUs - decodeUs) / 1e3, legacyChecksum);
- std::printf(" Sidecar (legacy) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- sidecarLegacyUs / 1e3, sidecarLegacyUs * 1e3 / double(nVoxels),
- (sidecarLegacyUs - decodeUs) / 1e3, sidecarLegacyChecksum);
- std::printf(" Sidecar (stencil) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- sidecarStencilUs / 1e3, sidecarStencilUs * 1e3 / double(nVoxels),
- (sidecarStencilUs - decodeUs) / 1e3, 
sidecarStencilChecksum);
- std::printf(" Sidecar (stencil+extrap): %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- sidecarStencilExtrapUs / 1e3, sidecarStencilExtrapUs * 1e3 / double(nVoxels),
- (sidecarStencilExtrapUs - decodeUs) / 1e3, sidecarStencilExtrapChecksum);
- std::printf(" Sidecar (+normSqGrad) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- sidecarStencilNormSqGradUs / 1e3, sidecarStencilNormSqGradUs * 1e3 / double(nVoxels),
- (sidecarStencilNormSqGradUs - decodeUs) / 1e3, sidecarStencilNormSqGradChecksum);
- std::printf(" Sidecar (transposed) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- sidecarXposedUs / 1e3, sidecarXposedUs * 1e3 / double(nVoxels),
- (sidecarXposedUs - decodeUs) / 1e3, sidecarXposedChecksum);
- std::printf(" Legacy transposed : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- legacyXposedUs / 1e3, legacyXposedUs * 1e3 / double(nVoxels),
- (legacyXposedUs - decodeUs) / 1e3, legacyXposedChecksum);
-
- if (stencilChecksum != legacyChecksum)
- std::cerr << " WARNING: stencil/legacy checksums differ — accessor results disagree!\n";
- if (legacyChecksum != legacyXposedChecksum)
- std::cerr << " WARNING: legacy/legacy-transposed checksums differ — ordering bug!\n";
- if (sidecarLegacyChecksum != sidecarStencilChecksum)
- std::cerr << " WARNING: sidecar legacy/stencil checksums differ — accessor results disagree!\n";
- if (sidecarLegacyChecksum != sidecarXposedChecksum)
- std::cerr << " WARNING: sidecar legacy/transposed checksums differ — ordering bug!\n";
-}
-
-// ============================================================
-// Entry point
-// ============================================================
-
-static void printUsage(const char* argv0)
-{
- std::cerr
- << "Usage: " << argv0 << " <input.vdb>"
- << " [--grid=<name>]"
- << " [--pass=<name>]"
- << " [--threads=<N>]"
- << " [--skip-validation]\n"
- << "\n"
- << " <input.vdb> Input OpenVDB file (single FloatGrid narrow-band)\n"
- << " --grid=<name> Select grid by name (default: first FloatGrid)\n"
- << " --pass=<name> Run one perf pass:\n"
- << " all (default), verify, decode, stencil,\n"
- << " framing, legacy, legacy-transposed,\n"
- << " sidecar-legacy, sidecar-stencil,\n"
- << " sidecar-stencil-extrap,\n"
- << " sidecar-stencil-normsqgrad, sidecar-transposed\n"
- << " --threads=<N> Limit TBB parallelism (0 = TBB default)\n"
- << " --skip-validation Skip the sidecar ordering sanity check\n";
-}
-
-int main(int argc, char** argv)
-{
- try {
- if (argc < 2 || std::string(argv[1]) == "--help"
- || std::string(argv[1]) == "-h") {
- printUsage(argv[0]);
- return argc < 2 ? 1 : 0;
- }
-
- std::string vdbPath = argv[1];
- std::string gridName = ""; // --grid=<name>
- std::string passFilter = "all"; // --pass=<name>
- int nThreads = 0; // --threads=<N>, 0 = TBB default
- bool skipValidation = false;
-
- for (int i = 2; i < argc; ++i) {
- std::string a = argv[i];
- if (a.rfind("--grid=", 0) == 0) gridName = a.substr(7);
- else if (a.rfind("--pass=", 0) == 0) passFilter = a.substr(7);
- else if (a.rfind("--threads=", 0) == 0) nThreads = std::stoi(a.substr(10));
- else if (a == "--skip-validation") skipValidation = true;
- else { printUsage(argv[0]); return 1; }
- }
-
- std::cout << "vdb path = " << vdbPath << "\n"
- << "grid name = " << (gridName.empty() ? "(first FloatGrid)" : gridName) << "\n"
- << "pass = " << passFilter << "\n"
- << "threads = " << (nThreads > 0 ? 
std::to_string(nThreads) : std::string("(TBB default)")) << "\n";
-
- // ---- OpenVDB setup and .vdb load ----
- openvdb::initialize();
- auto floatGrid = loadFloatGridFromVdb(vdbPath, gridName);
-
- const auto bbox = floatGrid->evalActiveVoxelBoundingBox();
- const auto dim = bbox.dim();
- const auto vsize = floatGrid->voxelSize();
- std::cout << "FloatGrid:\n"
- << " name = \"" << floatGrid->getName() << "\"\n"
- << " active voxels = " << floatGrid->activeVoxelCount() << "\n"
- << " bbox = [" << bbox.min() << " .. " << bbox.max() << "]"
- << " dim=" << dim << "\n"
- << " voxel size = " << vsize << "\n"
- << " background = " << floatGrid->background() << "\n";
-
- // ---- Convert to NanoVDB IndexGrid + separately-allocated float sidecar ----
- auto payload = convertToIndexGridWithSidecar(*floatGrid);
- auto* grid = payload.handle.grid<nanovdb::ValueOnIndex>();
- if (!grid) throw std::runtime_error("Failed to create ValueOnIndex grid");
-
- const auto& tree = grid->tree();
- std::cout << "IndexGrid:\n"
- << " leaves = " << tree.nodeCount(0) << "\n"
- << " lower nodes = " << tree.nodeCount(1) << "\n"
- << " upper nodes = " << tree.nodeCount(2) << "\n"
- << " active voxels = " << grid->activeVoxelCount() << "\n"
- << " valueCount = " << grid->valueCount() << "\n"
- << " sidecar entries = " << payload.sidecar.size() << "\n";
-
- // ---- Sidecar ordering sanity check ----
- if (!skipValidation) {
- if (validateSidecarOrdering(*floatGrid, *grid, payload.sidecar) != 0)
- throw std::runtime_error(
- "sidecar ordering mismatch -- aborting before benchmarks");
- }
-
- // ---- VBM ----
- auto vbmHandle = nanovdb::tools::buildVoxelBlockManager(grid);
- std::cout << "VBM:\n"
- << " blocks = " << vbmHandle.blockCount()
- << " (BlockWidth=" << BlockWidth << ")\n\n";
-
- // TBB thread-count limit for perf measurements.
- std::unique_ptr<tbb::global_control> tbbLimit;
- if (nThreads > 0) {
- tbbLimit = std::make_unique<tbb::global_control>(
- tbb::global_control::max_allowed_parallelism, (size_t)nThreads);
- }
-
- if (passFilter == "all" || passFilter == "verify")
- runPrototype(grid, vbmHandle);
- runPerf(grid, vbmHandle, payload.sidecar, passFilter);
- } catch (const std::exception& e) {
- std::cerr << "Exception: " << e.what() << "\n";
- return 1;
- }
- return 0;
-}
diff --git a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp b/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
deleted file mode 100644
index 5adcb2de12..0000000000
--- a/nanovdb/nanovdb/examples/ex_stencil_gather_cpu/stencil_gather_cpu.cpp
+++ /dev/null
@@ -1,627 +0,0 @@
-// Copyright Contributors to the OpenVDB Project
-// SPDX-License-Identifier: Apache-2.0
-
-/*!
- \file stencil_gather_cpu.cpp
-
- \brief CPU stencil gather: LegacyStencilAccessor vs StencilAccessor.
-
- Generates a random sparse domain, builds a ValueOnIndex NanoVDB grid and
- a VoxelBlockManager, then runs two stencil-index gather paths side by side:
-
- LegacyStencilAccessor -- scalar, one voxel at a time, ReadAccessor-based.
- Equivalent to OpenVDB's math/Stencils.h baseline:
- path-cached tree walk per tap, per voxel.
- The core comparison is the cost of path-cache
- eviction: distant WENO5 taps (±3) evict the
- center-leaf path, so each moveTo re-traverses
- the tree multiple times per voxel.
-
- StencilAccessor -- SIMD batch, SIMDw=16 lanes, BatchAccessor-based.
- Resolves neighbor leaves once per center-leaf run,
- then accesses all taps via direct array indexing.
-
- runPrototype() cross-validates both paths (LegacyStencilAccessor is the oracle). 
- runPerf() measures moveTo throughput for each path (warm-up pass discarded;
- second timed pass reported).
-
- Build:
- Configured via CMakeLists.txt in the parent examples/ directory.
- No CUDA required; CPU-only.
-
- Usage: stencil_gather_cpu [ambient_voxels [occupancy]]
-*/
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include <memory> // std::unique_ptr
-#include
-#include <numeric> // std::accumulate (checksum)
-#include
-#include
-
-// ============================================================
-// Constants and type aliases
-// ============================================================
-
-static constexpr int Log2BlockWidth = 7;
-static constexpr int BlockWidth = 1 << Log2BlockWidth; // 128
-static constexpr int SIMDw = 16; // StencilAccessor batch width
-
-using BuildT = nanovdb::ValueOnIndex;
-using GridT = nanovdb::NanoGrid<BuildT>;
-using LeafT = nanovdb::NanoLeaf<BuildT>;
-using CPUVBM = nanovdb::tools::VoxelBlockManager;
-
-using SAccT = nanovdb::StencilAccessor;
-using LegacyAccT = nanovdb::LegacyStencilAccessor;
-
-// ============================================================
-// Test domain generation (mirrors vbm_host_cuda.cpp)
-// ============================================================
-
-static uint32_t coordinate_bitpack(uint32_t x)
-{
- x &= 0x49249249;
- x |= (x >> 2); x &= 0xc30c30c3;
- x |= (x >> 4); x &= 0x0f00f00f;
- x |= (x >> 8); x &= 0xff0000ff;
- x |= (x >> 16); x &= 0x0000ffff;
- return x;
-}
-
-static std::vector<nanovdb::Coord>
-generateDomain(int ambient_voxels, float occupancy, uint32_t seed = 42)
-{
- const int target = (int)(occupancy * (float)ambient_voxels);
- std::mt19937 rng(seed);
- std::uniform_int_distribution<int> dist(0, ambient_voxels - 1);
- std::vector<bool> voxmap(ambient_voxels, false);
- int active = 0;
- while (active < target) {
- int i = dist(rng);
- if (!voxmap[i]) { voxmap[i] = true; ++active; }
- }
- std::vector<nanovdb::Coord> coords;
- coords.reserve(active);
- for (int i = 0; i < ambient_voxels; ++i) {
- if (voxmap[i]) {
- coords.emplace_back(
- (int)coordinate_bitpack( i & 0x49249249),
- (int)coordinate_bitpack((i >> 1) & 0x49249249),
- (int)coordinate_bitpack((i >> 2) & 0x49249249));
- }
- }
- return coords;
-}
-
-// ============================================================
-// Verification
-// ============================================================
-
-struct VerifyStats {
- uint64_t laneChecks = 0;
- uint64_t errors = 0;
-};
-
-/// Cross-validate one StencilAccessor batch against LegacyStencilAccessor.
-///
-/// Active lanes (leafIndex[p] != UnusedLeafIndex): reconstruct the global
-/// coordinate from (leafIndex, voxelOffset), call legacyAcc.moveTo(), and
-/// compare all SIZE tap indices element-by-element.
-///
-/// Inactive lanes: assert all tap slots in stencilAcc hold 0 (background index).
-static void verifyStencilAccessor(
- const SAccT& stencilAcc,
- const uint32_t* leafIndex,
- const uint16_t* voxelOffset,
- int batchStart,
- const LeafT* firstLeaf,
- LegacyAccT& legacyAcc,
- VerifyStats& stats)
-{
- for (int i = 0; i < SIMDw; ++i) {
- const int p = batchStart + i;
- const uint32_t li = leafIndex[p];
-
- if (li == CPUVBM::UnusedLeafIndex) {
- // Inactive lane: all tap slots must hold 0 (NanoVDB background index). 
- for (int k = 0; k < stencilAcc.size(); ++k) { - ++stats.laneChecks; - const uint64_t got = stencilAcc.mIndices[k][i]; - if (got != 0) { - ++stats.errors; - if (stats.errors <= 10) - std::cerr << "STENCIL inactive lane=" << i - << " tap=" << k - << ": expected 0, got " << got << "\n"; - } - } - continue; - } - - // Active lane: compare against the LegacyStencilAccessor oracle. - const uint16_t vo = voxelOffset[p]; - const nanovdb::Coord cOrigin = firstLeaf[li].origin(); - const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7; - - legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz)); - - for (int k = 0; k < stencilAcc.size(); ++k) { - ++stats.laneChecks; - const uint64_t expected = legacyAcc[k]; - const uint64_t actual = stencilAcc.mIndices[k][i]; - if (actual != expected) { - ++stats.errors; - if (stats.errors <= 10) - std::cerr << "STENCIL MISMATCH" - << " tap=" << k - << " lane=" << i - << " expected=" << expected - << " actual=" << actual << "\n"; - } - } - } -} - -// ============================================================ -// Correctness run: cross-validate StencilAccessor vs LegacyStencilAccessor -// ============================================================ - -static void runPrototype( - const GridT* grid, - const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle) -{ - const LeafT* firstLeaf = grid->tree().getFirstNode<0>(); - const uint64_t nVoxels = grid->activeVoxelCount(); - const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount(); - - const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID(); - const uint64_t* jumpMap = vbmHandle.hostJumpMap(); - - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - // LegacyStencilAccessor owns its ReadAccessor; one instance per thread. - LegacyAccT legacyAcc(*grid); - VerifyStats stats; - - for (uint32_t bID = 0; bID < nBlocks; ++bID) { - const uint64_t blockFirstOffset = - vbmHandle.firstOffset() + (uint64_t)bID * BlockWidth; - - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength], - blockFirstOffset, leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[(uint64_t)bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - verifyStencilAccessor(stencilAcc, - leafIndex, voxelOffset, batchStart, - firstLeaf, legacyAcc, stats); - } - } - - std::cout << "Correctness (StencilAccessor vs LegacyStencilAccessor):\n" - << " blocks = " << nBlocks << "\n" - << " voxels = " << nVoxels << "\n" - << " laneChecks = " << stats.laneChecks << "\n"; - - if (stats.errors == 0) - std::cout << " PASSED\n"; - else - std::cerr << " FAILED: " << stats.errors << " mismatches\n"; -} - -// ============================================================ -// End-to-end performance comparison (multithreaded) -// -// Both paths run the full pipeline inside util::forEach: -// decodeInverseMaps → coord extraction → stencil gather → sum → store -// -// decodeInverseMaps is deliberately included: its cost is identical for -// both paths (pure cancellation in the comparison) and including it avoids -// fine-grained intra-block timing artifacts. 
-//
-// Anti-DCE artifact: for each active voxel, accumulate the sum of all 19
-// tap uint64_t indices and write to sums[bID * BlockWidth + i]. The final
-// XOR checksum is printed, forcing the compiler to materialise the stores.
-//
-// Timing: nanovdb::util::Timer (steady_clock) around each forEach.
-// warm pass discards its measurement; only the second pass is reported.
-//
-// Denominator: grid->activeVoxelCount() — same for both paths.
-// ============================================================
-
-static void runPerf(
- const GridT* grid,
- const nanovdb::tools::VoxelBlockManagerHandle& vbmHandle,
- const std::string& passFilter = "all")
-{
- // wantPass() returns true if this pass should run under the current filter.
- // Supported names: "decode", "stencil", "framing", "legacy",
- // "legacy-transposed". "all" runs everything.
- auto wantPass = [&](const char* name) {
- return passFilter == "all" || passFilter == name;
- };
-
- const LeafT* firstLeaf = grid->tree().getFirstNode<0>();
- const uint64_t nVoxels = grid->activeVoxelCount();
- const uint32_t nBlocks = (uint32_t)vbmHandle.blockCount();
- const uint32_t* firstLeafID = vbmHandle.hostFirstLeafID();
- const uint64_t* jumpMap = vbmHandle.hostJumpMap();
- const uint64_t firstOffset = vbmHandle.firstOffset();
-
- // Anti-DCE output array. Each thread writes its own non-overlapping
- // range (bID * BlockWidth ... + BlockWidth - 1) — no synchronisation needed.
- std::vector<uint64_t> sums((size_t)nBlocks * BlockWidth, 0);
-
- std::ostringstream sink; // absorbs Timer's warm-pass "... " output
- nanovdb::util::Timer timer;
-
- auto timeForEach = [&](auto&& body) -> double {
- // warm pass
- timer.start("", sink);
- body();
- timer.elapsed();
- // timed pass
- timer.start("", sink);
- body();
- return static_cast<double>(timer.elapsed());
- };
-
- // ---- decodeInverseMaps-only baseline (both paths pay this cost) ----
- // Anti-DCE: XOR one uint64_t per block derived from leafIndex[] + voxelOffset[]
- // so the compiler can't elide the decode work. 
- double decodeUs = 0.0; - if (wantPass("decode")) decodeUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - uint64_t acc = 0; - for (int i = 0; i < BlockWidth; ++i) - acc ^= (uint64_t(leafIndex[i]) << 16) | uint64_t(voxelOffset[i]); - sums[bID * BlockWidth] = acc; // one slot per block as anti-DCE - } - }); - }); - - // ---- StencilAccessor ---- - double stencilUs = 0.0; - uint64_t stencilChecksum = 0; - if (wantPass("stencil")) { - std::fill(sums.begin(), sums.end(), uint64_t(0)); - - stencilUs = timeForEach([&] { - nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1), - [&](const nanovdb::util::Range1D& range) { - alignas(64) uint32_t leafIndex[BlockWidth]; - alignas(64) uint16_t voxelOffset[BlockWidth]; - - for (size_t bID = range.begin(); bID != range.end(); ++bID) { - CPUVBM::decodeInverseMaps( - grid, firstLeafID[bID], - &jumpMap[bID * CPUVBM::JumpMapLength], - firstOffset + bID * BlockWidth, - leafIndex, voxelOffset); - - int nExtraLeaves = 0; - for (int w = 0; w < CPUVBM::JumpMapLength; ++w) - nExtraLeaves += nanovdb::util::countOn( - jumpMap[bID * CPUVBM::JumpMapLength + w]); - - SAccT stencilAcc(*grid, firstLeafID[bID], (uint32_t)nExtraLeaves); - uint64_t* bs = sums.data() + bID * BlockWidth; - - for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - stencilAcc.moveTo(leafIndex + batchStart, voxelOffset + batchStart); - for (int i = 0; i < SIMDw; ++i) { - if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue; - uint64_t s = 0; - for (int k = 0; k < SAccT::size(); ++k) - s += stencilAcc.mIndices[k][i]; - bs[batchStart + i] = s; - } - } - } - }); - }); - - stencilChecksum = - std::accumulate(sums.begin(), sums.end(), uint64_t(0), - [](uint64_t a, uint64_t b) { return a ^ b; }); - } // end wantPass("stencil") - - // ---- Legacy framing floor: loop structure + decode, no accessor call ---- - // Anti-DCE writes derive from Coord components. Subtracted from the legacy - // pass to expose the 19-tap cost proper. 
- double framingUs = 0.0;
- if (wantPass("framing")) {
- std::fill(sums.begin(), sums.end(), uint64_t(0));
- framingUs = timeForEach([&] {
- nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
- [&](const nanovdb::util::Range1D& range) {
- alignas(64) uint32_t leafIndex[BlockWidth];
- alignas(64) uint16_t voxelOffset[BlockWidth];
- uint64_t* bs0 = sums.data();
-
- for (size_t bID = range.begin(); bID != range.end(); ++bID) {
- CPUVBM::decodeInverseMaps(
- grid, firstLeafID[bID],
- &jumpMap[bID * CPUVBM::JumpMapLength],
- firstOffset + bID * BlockWidth,
- leafIndex, voxelOffset);
-
- uint64_t* bs = bs0 + bID * BlockWidth;
- for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
- for (int i = 0; i < SIMDw; ++i) {
- const int p = batchStart + i;
- if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
- const uint16_t vo = voxelOffset[p];
- const uint32_t li = leafIndex[p];
- const nanovdb::Coord cOrigin = firstLeaf[li].origin();
- const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7;
- const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz);
- // 19 trivial "taps" — no accessor call; anti-DCE via Coord components.
- uint64_t s = 0;
- for (int k = 0; k < LegacyAccT::size(); ++k)
- s += static_cast<uint64_t>(center.x() + center.y() + center.z() + k);
- bs[p] = s;
- }
- }
- }
- });
- });
- } // end wantPass("framing")
-
- // ---- LegacyStencilAccessor ----
- double legacyUs = 0.0;
- uint64_t legacyChecksum = 0;
- if (wantPass("legacy")) {
- std::fill(sums.begin(), sums.end(), uint64_t(0));
-
- legacyUs = timeForEach([&] {
- nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
- [&](const nanovdb::util::Range1D& range) {
- alignas(64) uint32_t leafIndex[BlockWidth];
- alignas(64) uint16_t voxelOffset[BlockWidth];
- LegacyAccT legacyAcc(*grid); // one ReadAccessor per task
- uint64_t* bs0 = sums.data();
-
- for (size_t bID = range.begin(); bID != range.end(); ++bID) {
- CPUVBM::decodeInverseMaps(
- grid, firstLeafID[bID],
- &jumpMap[bID * CPUVBM::JumpMapLength],
- firstOffset + bID * BlockWidth,
- leafIndex, voxelOffset);
-
- uint64_t* bs = bs0 + bID * BlockWidth;
-
- for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
- for (int i = 0; i < SIMDw; ++i) {
- const int p = batchStart + i;
- if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
- const uint16_t vo = voxelOffset[p];
- const uint32_t li = leafIndex[p];
- const nanovdb::Coord cOrigin = firstLeaf[li].origin();
- const int lx = (vo >> 6) & 7, ly = (vo >> 3) & 7, lz = vo & 7;
- legacyAcc.moveTo(cOrigin + nanovdb::Coord(lx, ly, lz));
- uint64_t s = 0;
- for (int k = 0; k < LegacyAccT::size(); ++k) s += legacyAcc[k];
- bs[p] = s;
- }
- }
- }
- });
- });
-
- legacyChecksum =
- std::accumulate(sums.begin(), sums.end(), uint64_t(0),
- [](uint64_t a, uint64_t b) { return a ^ b; });
- } // end wantPass("legacy")
-
- // ---- Legacy transposed: tap-outer, voxel-inner ----
- // Same semantics as `legacy`, reordered. For each of the 19 WENO5 taps,
- // sweep all BlockWidth voxels — giving long runs of probeLeaf + getValue
- // calls with the SAME compile-time tap offset but varying center voxels. 
- double legacyXposedUs = 0.0;
- uint64_t legacyXposedChecksum = 0;
- if (wantPass("legacy-transposed")) {
- std::fill(sums.begin(), sums.end(), uint64_t(0));
-
- using Weno5Taps = nanovdb::Weno5Stencil::Taps;
- static constexpr int SIZE = int(std::tuple_size_v<Weno5Taps>);
-
- legacyXposedUs = timeForEach([&] {
- nanovdb::util::forEach(size_t(0), size_t(nBlocks), size_t(1),
- [&](const nanovdb::util::Range1D& range) {
- alignas(64) uint32_t leafIndex[BlockWidth];
- alignas(64) uint16_t voxelOffset[BlockWidth];
- alignas(64) nanovdb::Coord centers[SIMDw];
- alignas(64) uint64_t s[SIMDw];
- nanovdb::ReadAccessor<BuildT> acc(grid->tree().root());
- uint64_t* bs0 = sums.data();
-
- for (size_t bID = range.begin(); bID != range.end(); ++bID) {
- CPUVBM::decodeInverseMaps(
- grid, firstLeafID[bID],
- &jumpMap[bID * CPUVBM::JumpMapLength],
- firstOffset + bID * BlockWidth,
- leafIndex, voxelOffset);
-
- uint64_t* bs = bs0 + bID * BlockWidth;
-
- for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) {
- for (int i = 0; i < SIMDw; ++i) {
- s[i] = 0;
- const int p = batchStart + i;
- if (leafIndex[p] == CPUVBM::UnusedLeafIndex) continue;
- const uint16_t vo = voxelOffset[p];
- const uint32_t li = leafIndex[p];
- const nanovdb::Coord cOrigin = firstLeaf[li].origin();
- centers[i] = cOrigin + nanovdb::Coord(
- (vo >> 6) & 7, (vo >> 3) & 7, vo & 7);
- }
-
- auto processTap = [&]<int DI, int DJ, int DK>()
- [[gnu::always_inline]]
- {
- for (int i = 0; i < SIMDw; ++i) {
- if (leafIndex[batchStart + i] == CPUVBM::UnusedLeafIndex) continue;
- const nanovdb::Coord c = centers[i]
- + nanovdb::Coord(DI, DJ, DK);
- const LeafT* leaf = acc.probeLeaf(c);
- if (!leaf) continue;
- const uint32_t offset = (uint32_t(c[0] & 7) << 6)
- | (uint32_t(c[1] & 7) << 3)
- | uint32_t(c[2] & 7);
- s[i] += leaf->data()->getValue(offset);
- }
- };
-
- [&]<std::size_t... Is>(std::index_sequence<Is...>) {
- (processTap.template operator()<
- std::tuple_element_t<Is, Weno5Taps>::di,
- std::tuple_element_t<Is, Weno5Taps>::dj,
- std::tuple_element_t<Is, Weno5Taps>::dk>(), ...);
- }(std::make_index_sequence<SIZE>{});
-
- for (int i = 0; i < SIMDw; ++i) bs[batchStart + i] = s[i];
- }
- }
- });
- });
-
- legacyXposedChecksum =
- std::accumulate(sums.begin(), sums.end(), uint64_t(0),
- [](uint64_t a, uint64_t b) { return a ^ b; });
- } // end wantPass("legacy-transposed")
-
- std::printf("\nEnd-to-end stencil gather (%u blocks, %lu active voxels):\n",
- nBlocks, nVoxels);
- std::printf(" decodeInverseMaps only: %7.1f ms (%5.1f ns/voxel)\n",
- decodeUs / 1e3, decodeUs * 1e3 / double(nVoxels));
- std::printf(" Framing (no accessor) : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode]\n",
- framingUs / 1e3, framingUs * 1e3 / double(nVoxels),
- (framingUs - decodeUs) / 1e3);
- std::printf(" StencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- stencilUs / 1e3, stencilUs * 1e3 / double(nVoxels),
- (stencilUs - decodeUs) / 1e3, stencilChecksum);
- std::printf(" LegacyStencilAccessor : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- legacyUs / 1e3, legacyUs * 1e3 / double(nVoxels),
- (legacyUs - decodeUs) / 1e3, legacyChecksum);
- std::printf(" Legacy transposed : %7.1f ms (%5.1f ns/voxel) [%+5.1f ms over decode] checksum=0x%016lx\n",
- legacyXposedUs / 1e3, legacyXposedUs * 1e3 / double(nVoxels),
- (legacyXposedUs - decodeUs) / 1e3, legacyXposedChecksum);
-
- if (stencilChecksum != legacyChecksum)
- std::cerr << " WARNING: stencil/legacy checksums differ — accessor results disagree!\n";
- if (legacyChecksum != legacyXposedChecksum)
- std::cerr << " WARNING: legacy/legacy-transposed 
checksums differ — ordering bug!\n";
-}
-
-// ============================================================
-// Entry point
-// ============================================================
-
-int main(int argc, char** argv)
-{
- try {
- int ambient_voxels = 1024 * 1024;
- float occupancy = 0.5f;
- std::string passFilter = "all"; // --pass=<name>
- int nThreads = 0; // --threads=<N>, 0 = TBB default
-
- if (argc > 1) ambient_voxels = std::stoi(argv[1]);
- if (argc > 2) occupancy = std::stof(argv[2]);
- for (int i = 3; i < argc; ++i) {
- std::string a = argv[i];
- if (a.rfind("--pass=", 0) == 0) passFilter = a.substr(7);
- else if (a.rfind("--threads=", 0) == 0) nThreads = std::stoi(a.substr(10));
- }
- occupancy = std::max(0.0f, std::min(1.0f, occupancy));
-
- std::cout << "ambient_voxels = " << ambient_voxels << "\n"
- << "occupancy = " << occupancy << "\n"
- << "pass = " << passFilter << "\n"
- << "threads = " << (nThreads > 0 ? std::to_string(nThreads) : std::string("(TBB default)")) << "\n";
-
- auto coords = generateDomain(ambient_voxels, occupancy);
- std::cout << "Active voxels generated: " << coords.size() << "\n";
-
- // Build a float build grid from the coordinates.
- nanovdb::tools::build::Grid<float> buildGrid(0.f);
- for (const auto& coord : coords)
- buildGrid.tree().setValue(coord, 1.f);
-
- // Convert build::Grid → NanoGrid<float> → NanoGrid<ValueOnIndex>.
- auto floatHandle = nanovdb::tools::createNanoGrid(buildGrid);
- auto indexHandle = nanovdb::tools::createNanoGrid<
- nanovdb::NanoGrid<float>,
- nanovdb::ValueOnIndex>(
- *floatHandle.grid<float>(),
- 0u, // channels: no sidecar blind data
- false, // includeStats
- false); // includeTiles
- auto* grid = indexHandle.grid<BuildT>();
- if (!grid) throw std::runtime_error("Failed to create ValueOnIndex grid");
-
- const auto& tree = grid->tree();
- std::cout << "Leaves=" << tree.nodeCount(0)
- << " Lower=" << tree.nodeCount(1)
- << " Upper=" << tree.nodeCount(2)
- << " Active=" << grid->activeVoxelCount() << "\n";
-
- auto vbmHandle = nanovdb::tools::buildVoxelBlockManager(grid);
- std::cout << "VBM blocks=" << vbmHandle.blockCount()
- << " (BlockWidth=" << BlockWidth << ")\n\n";
-
- // TBB thread-count limit for perf measurements.
- std::unique_ptr<tbb::global_control> tbbLimit;
- if (nThreads > 0) {
- tbbLimit = std::make_unique<tbb::global_control>(
- tbb::global_control::max_allowed_parallelism, (size_t)nThreads);
- }
-
- if (passFilter == "all" || passFilter == "verify")
- runPrototype(grid, vbmHandle);
- runPerf(grid, vbmHandle, passFilter);
-
- } catch (const std::exception& e) {
- std::cerr << "Exception: " << e.what() << "\n";
- return 1;
- }
- return 0;
-}
diff --git a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md b/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
deleted file mode 100644
index eb77a022f2..0000000000
--- a/nanovdb/nanovdb/examples/ex_voxelBlockManager_host_cuda/StencilGather.md
+++ /dev/null
@@ -1,727 +0,0 @@
-# Per-Block Stencil Gather
-
-This document is the design and planning reference for the per-block stencil gather
-kernel — the operation that, given a built VBM, a block ID, and a user-supplied kernel
-lambda, gathers stencil neighbor values for all active voxels in the block and
-produces a per-voxel output array. It is written as dense, agent-consumable facts
-and design decisions.
-
-The WENO5 19-point stencil (±3 along each axis independently) is the motivating
-instantiation, but the architecture is stencil-agnostic. 
The stencil shape enters
-as a compile-time template parameter governing the number of neighbor slots, the
-neighbor leaf resolution logic, and the value fetch. The user supplies a scalar
-kernel lambda that operates on the gathered values and produces the output.
-
----
-
-**Related design document**: `BatchAccessor.md` (same directory) — full design of the
-SIMD batch leaf-neighborhood cache that provides the `prefetch` / `cachedGetValue` /
-`getValue` API used by the stencil kernel in Phase 2.
-
----
-
-## 1. Scope and Place in the Architecture
-
-The stencil gather sits at the **second level** of the two-level VBM parallelism
-hierarchy:
-
-| Level | Operation | Parallelism |
-|-------|-----------|-------------|
-| System | `buildVoxelBlockManager` | Threading (TBB/CUDA grid) over blocks |
-| Per-block | `decodeInverseMaps` + stencil gather | SIMD/SIMT within one block |
-
-The stencil gather:
-- **Assumes** the VBM has already been built (`firstLeafID[]`, `jumpMap[]` populated).
-- **Is called** once per voxel block, from one CPU thread or one CUDA CTA.
-- **Uses** no inter-block communication and holds no state beyond its call.
-- **Is not** responsible for launching threads or distributing work across blocks.
- That is the caller's responsibility (a future launcher, analogous to
- `buildVoxelBlockManager`).
-
----
-
-## 2. Per-Block Execution Model
-
-Within one call (one CPU thread / one CUDA CTA):
-
-1. **Decode inverse maps** into block-local storage:
- - GPU: `smem_leafIndex[BlockWidth]` / `smem_voxelOffset[BlockWidth]` in shared
- memory, filled cooperatively by the CTA via `decodeInverseMaps`.
- - CPU: `leafIndex[BlockWidth]` / `voxelOffset[BlockWidth]` on the stack
- (cache-resident), filled by a single call to `decodeInverseMaps`.
-2. **Loop over active voxels** in the block (positions where
- `leafIndex[p] != UnusedLeafIndex`).
-3. **For each active voxel**: resolve the neighbor leaf pointers, fetch the N
- neighbor values into a local array, invoke the kernel lambda, and write the
- output.
-
-**Key invariant on intermediate storage**: `leafIndex`, `voxelOffset`, the
-neighbor-leaf-pointer structs, and the per-voxel value arrays are all scratch only.
-They do not persist beyond this per-block call. The kernel output array is the
-only output.
-
----
-
-## 3. Stencil Type as Template Parameter
-
-### 3a. What the Infrastructure Needs
-
-The gather infrastructure iterates over stencil slots `n = 0 .. N-1` and for each
-needs to know the Cartesian offset `(Δx, Δy, Δz)` to look up. The pipeline is:
-
-```
-for n in 0..N-1:
- values[n] = grid.getValue(center + StencilT::offset(n))
-```
-
-This requires **index → offsets** direction: given slot index n, return `(Δx, Δy, Δz)`.
-
-The existing `WenoPt::idx` (NanoVDB) and `NineteenPt::idx` (OpenVDB)
-go in the **opposite** direction (offsets → index) and are primarily useful to the
-user writing the kernel lambda (addressing a specific neighbor by name). They are
-not directly usable by the infrastructure's gather loop.
-
-The stencil type must therefore expose a compile-time offset table:
-
-```cpp
-// For each slot n in [0, N), the Cartesian offset
-static constexpr std::array<std::array<int, 3>, N> offsets;
-// or equivalently a static constexpr accessor:
-static constexpr std::array<int, 3> offset(int n);
-```
-
-### 3b. Relationship to BaseStencil / WenoStencil
-
-`nanovdb::math::BaseStencil` and `WenoStencil` couple the
-stencil geometry to a grid accessor (`mAcc`) via `init()` / `moveTo()`. 
This coupling
-is incompatible with the VBM batch gather, where the infrastructure owns the value
-lookup.
-
-What is reusable from the existing design:
-- `SIZE` / `static constexpr int SIZE` — directly useful.
-- `WenoPt::idx` / `pos()` — useful to the *user's kernel lambda*
- for addressing neighbors by name, but not to the gather loop itself.
-
-The stencil type for our template parameter is a **geometry-only descriptor** — no
-accessor, no stored values. It could be a thin wrapper around the existing types,
-or a new family of types alongside them.
-
-### 3c. Stencil Characteristics
-
-- **N** (`SIZE`): number of points including center.
-- **Offset table**: compile-time mapping from slot index → `(Δx, Δy, Δz)`.
-- **Reach R**: `max |Δ|` over all axes and all slots. Governs neighbor leaf
- resolution (see §5).
-
-For WENO5: N=19, R=3, offsets derived from `WenoPt` specializations.
-
----
-
-## 4. Kernel Lambda and Output Type
-
-### 4a. Kernel Lambda Signature
-
-The user supplies a kernel lambda with signature:
-
-```cpp
-std::array<ValueType, K> kernel(const ValueType* u);
-```
-
-where `u[n]` is the grid value at stencil slot `n` (i.e. `u[0]` is the center,
-`u[WenoPt<1,0,0>::idx]` is the +x neighbor for WENO5, etc.). The lambda is
-completely unaware of indices, leaf pointers, or SIMD lanes.
-
-Example — Laplacian (K=1):
-```cpp
-auto laplacian = [](const float* u) -> std::array<float, 1> {
- return { -6.f*u[0] + u[GradPt<1,0,0>::idx] + u[GradPt<-1,0,0>::idx]
- + u[GradPt<0,1,0>::idx] + u[GradPt<0,-1,0>::idx]
- + u[GradPt<0,0,1>::idx] + u[GradPt<0,0,-1>::idx] };
-};
-```
-
-Example — gradient (K=3):
-```cpp
-auto grad = [](const float* u) -> std::array<float, 3> {
- return { 0.5f*(u[GradPt<1,0,0>::idx] - u[GradPt<-1,0,0>::idx]),
- 0.5f*(u[GradPt<0,1,0>::idx] - u[GradPt<0,-1,0>::idx]),
- 0.5f*(u[GradPt<0,0,1>::idx] - u[GradPt<0,0,-1>::idx]) };
-};
-```
-
-### 4b. Output Type: std::array
-
-The output is always `std::array<ValueType, K>` — homogeneous in type. K=1
-degenerates naturally to the scalar case without special-casing.
-
-Heterogeneous output (e.g. `std::tuple`) is not needed for the typical PDE/level-set
-workload: Laplacian (K=1), gradient (K=3), WENO upwind differences (K=6), curvature
-components (K=2) are all uniform in type. A tuple would also defeat auto-vectorization.
-
-### 4c. Output Buffer Layout
-
-The per-block output is stored in SoA layout:
-
-```
-results[k][BlockWidth] for k = 0 .. K-1
-```
-
-Each channel `k` is a contiguous array of `ValueType` across all BlockWidth voxel
-positions, mapping cleanly to K independent SIMD registers. AoS layout
-(`results[BlockWidth][K]`) would interleave channels and defeat SIMD.
-
-K is either deduced from the lambda's return type or supplied as an explicit template
-parameter.
-
----
-
-## 5. Neighbor Leaf Resolution
-
-### 5a. How Many Leaf Neighbors Per Axis
-
-A leaf covers 8 positions along each axis. For a stencil with reach R, a voxel at
-leaf-local position p along one axis needs neighbors at p-R .. p+R. The number of
-distinct leaves touched along that axis depends on where p falls within the leaf:
-
-- For R ≤ 3 (e.g. WENO5): at most **one** neighbor leaf per axis (either lo or hi,
- never both simultaneously, for any p in [0,7]). This is because the worst case
- (p=0, reach=3) reaches p-3 = -3 (one leaf back) but p+3 = 3 (still in the same
- leaf). The same argument covers R = 4: p-4 and p+4 can never both leave an
- 8-voxel leaf for the same p.
- See the standalone check after this list.
-- For R > 4: a center voxel near the middle of a leaf can require neighbors in both
- the lo and the hi neighboring leaf along the same axis simultaneously. 
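-
-A standalone brute-force check of this boundary (illustrative code, not part of the
-prototype; plain C++ with no NanoVDB types). For each reach R it reports the
-worst-case number of extra leaves any leaf-local position needs along one axis:
-
-```cpp
-#include <algorithm>
-#include <cstdio>
-
-int main()
-{
-    for (int R = 1; R <= 6; ++R) {
-        int worst = 0;
-        for (int p = 0; p < 8; ++p) {       // leaf-local position along one axis
-            const bool lo = (p - R) < 0;    // interval [p-R, p+R] exits below...
-            const bool hi = (p + R) > 7;    // ...or above the 8-voxel leaf
-            worst = std::max(worst, int(lo) + int(hi));
-        }
-        // Prints 1 for R <= 4 and 2 for R >= 5, confirming the
-        // single-neighbor-per-axis claim up to and including R = 4.
-        std::printf("R=%d -> max extra leaves per axis = %d\n", R, worst);
-    }
-    return 0;
-}
-```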
-
-The current `resolveLeafPtrs` design (`ptrs[axis][0..2]`: lo/center/hi) is correct
-for R ≤ 3. A more general design would use `ptrs[axis][0..K]` where K = number of
-neighbor leaves per axis.
-
-### 5b. resolveLeafPtrs — Design
-
-```
-resolveLeafPtrs(grid, leaf, voxelOffset) → StencilLeafPtrs
-```
-
-- Performs the minimum number of `probeLeaf` calls required by the stencil shape.
-- For WENO5 (R=3): exactly **3 probeLeaf calls total** (one per axis), since at most
- one neighbor leaf is needed per axis.
-- Returns a `StencilLeafPtrs` struct whose layout is stencil-specific (see §6).
-- Intentionally scalar: `probeLeaf` is pointer-chasing and not vectorizable.
-
-### 5c. computeStencil — Design
-
-```
-computeStencil(leaf, voxelOffset, leafPtrs, data[N])
-```
-
-- Fills `data[N]` with global sequential indices for all N stencil points.
-- Caller must zero-initialize `data[]`; entries for out-of-narrow-band neighbors
- remain 0.
-- Uses the stencil's index mapping (e.g. `WenoPt::idx`) throughout —
- never hardcoded integers.
-- This is the auto-vectorization target for the CPU port (see §6).
-
----
-
-## 6. Neighbor Direction Encoding and Leaf Pointer Tables
-
-### 6a. Shared 3×3×3 Bit Encoding
-
-All stencil types use the same flat bit encoding for neighbor directions, based on
-the 3×3×3 cube of immediately adjacent leaves:
-
-```
-bit(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)
-```
-
-where `(dx, dy, dz) ∈ {-1, 0, +1}³`. This yields 27 bits total, fitting in a
-`uint32_t`. Bit 13 is the center `(0,0,0)` — always implicit, never probed.
-
-```
-neighborCoord(centerCoord, bit):
- dx = bit/9 - 1, dy = (bit/3)%3 - 1, dz = bit%3 - 1
- return centerCoord + Coord(dx*8, dy*8, dz*8) // leaf origin offset
-```
-
-The six WENO5 face-neighbor bits are a strict subset of the 27:
-
-| Direction | (dx,dy,dz) | bit |
-|-----------|-----------|-----|
-| x-lo | (-1, 0, 0) | **4** |
-| y-lo | ( 0,-1, 0) | **10** |
-| z-lo | ( 0, 0,-1) | **12** |
-| z-hi | ( 0, 0,+1) | **14** |
-| y-hi | ( 0,+1, 0) | **16** |
-| x-hi | (+1, 0, 0) | **22** |
-
-For the box stencil, all 26 non-center bits may be set. The encoding is identical;
-only the set of active bits differs.
-
-### 6b. Common Per-Leaf Canonical State (CPU)
-
-State that persists across batches within one center leaf:
-
-```cpp
-uint32_t probedMask = 0; // bit d set ↔ direction d has been probed this leaf
-const LeafT* ptrs[27] = {}; // canonical neighbor table; ptrs[13] unused (center)
-Coord centerLeafCoord;
-```
-
-`ptrs[]` is populated lazily by the `probeLeaf` loop (§8d). For WENO5, only the
-six face-direction entries (bits 4,10,12,14,16,22) are ever non-null; the 21
-edge/corner entries remain null throughout.
-
-### 6c. Stencil-Specific Per-Batch SIMD Table
-
-After the probeLeaf loop fills `ptrs[27]`, the relevant entries are broadcast into a
-per-lane SIMD table whose layout is stencil-specific:
-
-**WENO5 — `batchPtrs[4][SIMDw]`** (center + one per axis):
-- `[0][i]` — center leaf (uniform broadcast of `&currentLeaf`)
-- `[1][i]` — x-axis neighbor: `ptrs[4]` if `lx < R`, `ptrs[22]` if `lx >= 8-R`, else `nullptr`
-- `[2][i]` — y-axis neighbor: `ptrs[10]` / `ptrs[16]` / `nullptr`
-- `[3][i]` — z-axis neighbor: `ptrs[12]` / `ptrs[14]` / `nullptr`
-
-The broadcast is masked: a scalar `ptrs[bit]` value is written into lane `i` under
-the condition that lane `i`'s local coordinate requires that direction. The
-lo/hi decision is encoded in the ptr value itself — `computeStencil` does not need
-to distinguish lo from hi at index-computation time. 
- -**Box stencil — `batchPtrs[3][3][3][SIMDw]`**: the full 27-entry cube, per lane. -Population follows the same masked-broadcast pattern, driven by each lane's -`(lx, ly, lz)` relative to leaf boundaries. - -This compaction is the step that bridges the shared scalar probeLeaf machinery (§8d) -and the SIMD stencil index computation (§8g). - -### 6d. GPU Scalar Design (Unchanged) - -The GPU per-thread design uses `ptrs[3][3]` (axis × {lo, center, hi}) and probes -all needed directions unconditionally on entry — acceptable because each GPU thread -handles one voxel and the probe count is bounded by 3. The GPU design does not -use `probedMask` or the 27-bit encoding. Both CPU and GPU designs resolve neighbor -leaves via `probeLeaf`; the machinery diverges only in batch vs. scalar granularity. - ---- - -## 7. GPU Inner Loop (Current Draft) - -After `decodeInverseMaps`, each thread with `smem_leafIndex[tID] != UnusedLeafIndex`: - -```cpp -const auto& leaf = tree.getFirstNode<0>()[smem_leafIndex[tID]]; -const uint16_t vo = smem_voxelOffset[tID]; - -uint64_t stencilData[N] = {}; -auto leafPtrs = VBM::resolveLeafPtrs(grid, leaf, vo); -VBM::computeStencil(leaf, vo, leafPtrs, stencilData); -``` - -No synchronization needed between decode and stencil steps beyond the `__syncthreads()` -already inside `decodeInverseMaps`. `resolveLeafPtrs` and `computeStencil` are both -per-thread and divergence-safe. - ---- - -## 8. CPU Inner Loop - -### 8a. SIMD Batch Width - -Process voxels in batches of `SIMDw = 16`. With AVX2 (16 × uint16_t per register), -each batch maps to one SIMD register width for `voxelOffset`. - -### 8b. Scan-Order Coherence and Expected probeLeaf Count - -NanoVDB linearizes active voxels **z-fast, y-medium, x-slow** (offset = x×64 + y×8 + z). -This means consecutive sequential active voxels vary z fastest and x slowest. The -expected intra-leaf distribution across a batch of SIMDw=16 at ~50% leaf density -(~256 active voxels per leaf): - -- 16 active voxels span ~32 scan positions → one fixed intra-leaf **x** value, - ~4 consecutive **y** values, and all 8 **z** values covered. - -This axis asymmetry determines the expected number of **unique** probeLeaf calls -per batch after deduplication: - -| Axis | Reason | Expected unique probes | -|------|--------|----------------------| -| x | All 16 voxels at same intra-leaf x; need lo or hi but not both | **≈ 0.75** | -| y | Spans ~4 y values; may straddle lo/hi boundary | **≈ 1.2** | -| z | All 8 z values present; always needs both z-lo and z-hi | **≈ 2** (deterministic) | - -**Total expected unique probeLeaf calls per batch: ~4** (well below the theoretical -maximum of 6). - -For stencils with R ≤ 3 (WENO5), a voxel at intra-leaf position p needs the -lo neighbor when p < R and the hi neighbor when p > (LeafDim - 1 - R). -For R=3, LeafDim=8: lo needed for p ∈ {0,1,2}, hi for p ∈ {5,6,7}. - -At lower leaf densities the batch spans more leaves and the expected count rises -toward 6; at higher densities it falls toward 2 (x and y each converge to 1, z stays 2). - -### 8c. ReadAccessor: Cache Behavior for probeLeaf - -The NanoVDB `DefaultReadAccessor` (`ReadAccessor`) stores -three independent single-slot caches: one per tree level (leaf/lower/upper). The -`get` dispatch checks **only the cache at `OpT::LEVEL`**, as an `if constexpr` -chain: - -```cpp -if constexpr(OpT::LEVEL <= 0) { - if (isCached(ijk)) return leaf->getAndCache(...); // leaf hit -} else if constexpr(OpT::LEVEL <= 1) { ... 
} // compiled away for GetLeaf - else if constexpr(OpT::LEVEL <= 2) { ... } // compiled away for GetLeaf -return mRoot->getAndCache(ijk, *this); // leaf miss → full traversal -``` - -For `GetLeaf` (LEVEL=0), the compiled code is exactly two paths: - -- **Leaf cache hit**: `isCached` check (3 masked comparisons) + return - `mNode[0]`. No memory loads beyond the accessor struct. Cost: ~6 integer - instructions, essentially free. - -- **Leaf cache miss**: falls directly to `mRoot->getAndCache` — a **full - root-to-leaf traversal**, identical in cost to `tree.probeLeaf(ijk)`. The lower - and upper node caches (`mNode[1]`, `mNode[2]`) are **not consulted** for LEVEL=0 - operations; they are populated as a side effect of the traversal but never read - back for subsequent `get` calls. - -This is a deliberate NanoVDB design choice (simpler code, better GPU SIMT behavior). -It differs from OpenVDB's `ValueAccessor3`, which does check lower/upper caches on -a leaf miss and can short-circuit traversal from a cached lower node. - -**Implications for probeLeaf in the stencil gather:** - -The ReadAccessor only helps `probeLeaf` when consecutive calls land in the **same -leaf**. For calls targeting different leaves — even adjacent leaves in the same -lower node — it is a full root traversal each time. - -**Accessor granularity:** Use **one `DefaultReadAccessor` per CPU thread**, -constructed once before the block loop and reused across all blocks and all axes. -Per-axis accessors would each pay a cold traversal for their first probe in a batch, -losing the cross-axis leaf-cache sharing (in the typical single-leaf batch, one probe -warms the leaf and all subsequent probes across any axis that happen to need the same -leaf get the hit for free). Per-block construction discards carryover between -consecutive blocks, which is wasteful since consecutive blocks process spatially -adjacent leaves. - -### 8d. Neighbor Leaf Resolution — Lazy Probe with Per-Leaf Cache - -**Why not unconditional probing:** an alternative design probes all `NUM_DIRS` -neighbor directions when the center leaf changes, caching the full pointer table -upfront. For WENO5 (6 face-neighbor directions) this is only marginally wasteful. -For the box stencil (26 directions: 6 faces + 12 edges + 8 corners), most batches -are interior and never touch edge or corner leaves; unconditionally probing all 26 -would waste ~15–20 probeLeaf calls per center leaf. - -**Why not naive per-voxel accessor use:** calling `acc.probeLeaf` for every lane -without deduplication causes leaf-cache thrashing at every y-row boundary (the cache -alternates between z-lo and z-hi at each transition). For 4 y-rows per batch, the -z-direction alone produces ~8 full traversals instead of 2. Not recommended. 
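-
-Before the design itself, the §6a direction decode that the probe loop below calls
-as `neighborCoord`, written out as a concrete helper. A sketch only: the name and
-signature come from the §6a pseudocode, not from a shipped NanoVDB API, and 8³
-leaves are assumed:
-
-```cpp
-// Decode a flat §6a direction bit d = (dx+1)*9 + (dy+1)*3 + (dz+1), d != 13,
-// into a coordinate inside the neighboring leaf (one 8-voxel step per axis).
-inline nanovdb::Coord neighborCoord(const nanovdb::Coord& centerLeafOrigin, int d)
-{
-    const int dx = d / 9 - 1;       // each component recovered in {-1, 0, +1}
-    const int dy = (d / 3) % 3 - 1;
-    const int dz = d % 3 - 1;
-    return centerLeafOrigin + nanovdb::Coord(dx * 8, dy * 8, dz * 8);
-}
-```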
-
-**Design: lazy probe with per-leaf `probedMask`.**
-
-State that persists across all batches within the same center leaf (see §6b):
-
-```cpp
-uint32_t probedMask = 0; // 27-bit; bit = (dx+1)*9 + (dy+1)*3 + (dz+1)
-const LeafT* ptrs[27] = {}; // canonical neighbor table (§6a); center implicit
-Coord centerLeafCoord;
-```
-
-Per-batch logic — Phase 1 (probeLeaf):
-
-```cpp
-uint32_t neededMask = computeNeededDirs(voxelOffset_batch, laneMask); // §8e
-uint32_t toProbe = neededMask & ~probedMask; // needed AND not yet cached
-
-while (toProbe) {
- int d = __builtin_ctz(toProbe); // position of lowest set bit
- ptrs[d] = acc.get(neighborCoord(centerLeafCoord, d));
- probedMask |= (1u << d);
- toProbe &= toProbe - 1; // clear lowest set bit
-}
-```
-
-Per-batch — Phase 2 (populate stencil-specific `batchPtrs` from `ptrs[27]`):
-
-```cpp
-// WENO5 example:
-const LeafT* batchPtrs[4][SIMDw];
-for (int i = 0; i < SIMDw; i++) batchPtrs[0][i] = &currentLeaf;
-for (int i = 0; i < SIMDw; i++) {
- int lx = voxelOffset[b+i] >> 6;
- batchPtrs[1][i] = (lx < R) ? ptrs[4] : (lx >= 8-R) ? ptrs[22] : nullptr;
- int ly = (voxelOffset[b+i] >> 3) & 7;
- batchPtrs[2][i] = (ly < R) ? ptrs[10] : (ly >= 8-R) ? ptrs[16] : nullptr;
- int lz = voxelOffset[b+i] & 7;
- batchPtrs[3][i] = (lz < R) ? ptrs[12] : (lz >= 8-R) ? ptrs[14] : nullptr;
-}
-```
-
-Then `computeStencil(batchPtrs, voxelOffset + b, data + b)` (§8g).
-
-On center leaf advance (`currentLeafID++`):
-
-```cpp
-probedMask = 0;
-centerLeafCoord = tree.getFirstNode<0>()[currentLeafID].origin();
-// stale ptrs[] entries are harmless; probedMask=0 guarantees re-probe before use
-```
-
-`probedMask` persists across batch boundaries. A direction probed during batch k
-is not re-probed during batch k+1 if the center leaf has not changed. Total
-probeLeaf calls per center leaf = number of distinct directions needed across all
-batches in that leaf, always ≤ 26 (≤ 6 for WENO5).
-
-**Where the ReadAccessor genuinely earns its keep:** the `getValue` calls inside
-`computeStencil` that fetch N stencil values per voxel. Many of these land in the
-same leaf repeatedly. One accessor per thread, reused across the entire block loop,
-accumulates leaf-cache hits throughout the computation.
-
-### 8e. `computeNeededDirs` — Shift-OR Carry Trick
-
-```cpp
-// Caller builds the pre-expanded vector at the gather site:
-// 1. broadcast kSentinelExpanded to all SIMDw lanes (inactive/straddle lanes stay neutral)
-// 2. overwrite leafMask lanes with expandVoxelOffset(voxelOffset[b+i])
-// Then call:
-uint32_t computeNeededDirs(Simd<uint32_t, SIMDw> expandedVec);
-```
-
-Returns a bitmask of directions whose neighbor leaf is required by at least one active
-lane. Purely arithmetic — no tree access, no probeLeaf. Direction bits use the §6a
-encoding: `bit(dx,dy,dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)`.
-
-**Direction encoding (WENO5, 6 active bits out of 27):**
-
-| Bit | Direction | (dx,dy,dz) | Condition (per lane) |
-|-----|-----------|-----------|-----------------------------------|
-| 4 | x-lo | (-1,0,0) | `lx < R` |
-| 10 | y-lo | (0,-1,0) | `ly < R` |
-| 12 | z-lo | (0,0,-1) | `lz < R` |
-| 14 | z-hi | (0,0,+1) | `lz >= (8 - R)` |
-| 16 | y-hi | (0,+1,0) | `ly >= (8 - R)` |
-| 22 | x-hi | (+1,0,0) | `lx >= (8 - R)` |
-
-where `lx = vo >> 6`, `ly = (vo >> 3) & 7`, `lz = vo & 7`. 
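-
-For cross-checking, the same function can be written as a scalar reference that
-loops over lanes and applies the condition table above directly — the shape of the
-"always-on scalar cross-check" that §9 mentions. A sketch; `computeNeededDirsScalar`
-and its parameter list are illustrative names, not prototype API:
-
-```cpp
-#include <cstdint>
-
-// Scalar reference for computeNeededDirs: per-lane decode of (lx, ly, lz) from
-// the packed voxel offset, then the six threshold tests from the table above.
-// laneMask selects the active lanes; bit numbers follow the §6a encoding.
-inline uint32_t computeNeededDirsScalar(const uint16_t* vo, uint32_t laneMask,
-                                        int simdW, int R = 3)
-{
-    uint32_t needed = 0;
-    for (int i = 0; i < simdW; ++i) {
-        if (!(laneMask & (1u << i))) continue;
-        const int lx = (vo[i] >> 6) & 7, ly = (vo[i] >> 3) & 7, lz = vo[i] & 7;
-        if (lx < R)      needed |= 1u << 4;   // x-lo (-1, 0, 0)
-        if (lx >= 8 - R) needed |= 1u << 22;  // x-hi (+1, 0, 0)
-        if (ly < R)      needed |= 1u << 10;  // y-lo ( 0,-1, 0)
-        if (ly >= 8 - R) needed |= 1u << 16;  // y-hi ( 0,+1, 0)
-        if (lz < R)      needed |= 1u << 12;  // z-lo ( 0, 0,-1)
-        if (lz >= 8 - R) needed |= 1u << 14;  // z-hi ( 0, 0,+1)
-    }
-    return needed;
-}
-```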
- -**Algorithm — "expand, add, reduce" (single SIMD add for all 6 directions):** - -`expandVoxelOffset(vo)` packs lz, lx, ly into a 32-bit integer with 3-bit zero-guard -separators so that one carry-bit addition simultaneously tests all six directions. -Three shift-OR steps; no multiply: - -``` -e = vo -e |= (e << 9) // two packed xyz copies at 9-bit stride -e &= 0x71C7 // 0b0111_0001_1100_0111 — isolate lz@[0:2], lx@[6:8], ly@[12:14] -e |= (e << 16) // duplicate lower 15 bits to [16:30] -``` - -Target layout after expansion: - -``` -bits 0– 2 : lz (group 1 — plus-z carry exits at bit 3) -bits 3– 5 : 0 (3-bit guard) -bits 6– 8 : lx (group 2 — plus-x carry exits at bit 9) -bits 9–11 : 0 -bits 12–14 : ly (group 3 — plus-y carry exits at bit 15) -bit 15 : 0 (1-bit guard — sufficient: max carry from 3-bit + constant < 8 is 1 bit) -bits 16–18 : lz (group 4 — minus-z carry exits at bit 19) -bits 19–21 : 0 -bits 22–24 : lx (group 5 — minus-x carry exits at bit 25) -bits 25–27 : 0 -bits 28–30 : ly (group 6 — minus-y carry exits at bit 31) -``` - -`kExpandCarryK` encodes the detection threshold for all six groups in one `uint32_t` -(= 0x514530C3): - -``` -K = R | R<<6 | R<<12 | (8-R)<<16 | (8-R)<<22 | (8-R)<<28 - = 0x514530C3 (for R = 3) -``` - -Groups 1–3 receive `+R`: a field ≥ (8−R) carries (plus-direction needed). -Groups 4–6 receive `+(8−R)`: a field ≥ R carries (a CLEAR carry means minus-direction -needed — at least one lane had lc < R). - -After `result = expandedVec + kExpandCarryK` (one `vpaddd ymm` × 2): - -``` -hor_or = OR of all lanes → bit k SET ↔ plus-direction k needed (any lane) -hor_and = AND of all lanes → bit k CLEAR ↔ minus-direction k needed (any lane) - -if (hor_or & (1 << 3)) neededMask |= (1 << kHiBit[z]); // bit 14 -if (hor_or & (1 << 9)) neededMask |= (1 << kHiBit[x]); // bit 22 -if (hor_or & (1 << 15)) neededMask |= (1 << kHiBit[y]); // bit 16 -if (!(hor_and & (1 << 19))) neededMask |= (1 << kLoBit[z]); // bit 12 -if (!(hor_and & (1 << 25))) neededMask |= (1 << kLoBit[x]); // bit 4 -if (!(hor_and & (1 << 31))) neededMask |= (1 << kLoBit[y]); // bit 10 -``` - -**Sentinel for inactive/straddle lanes:** Local coordinate (4,4,4) maps to -`kInactiveVoxelOffset = 292`. Its pre-expanded form `kSentinelExpanded = 0x41044104` -satisfies: groups 1–3 sum to 4+3=7 (no carry → plus bits stay clear), groups 4–6 sum -to 4+5=9 (carry → minus bits stay set). The sentinel is broadcast at the **gather -site** — the only place where `leafMask` is known — before overwriting the active -lanes. This keeps sentinel responsibility out of `computeNeededDirs` itself. - -**Codegen (AVX2, `ex_stencil_gather_cpu`, -O3 -mavx2):** - -`computeNeededDirs` compiles to a non-inlined function of ~80 bytes with no branches -or function calls in the carry path: - -```asm -vpbroadcastd xmm0→ymm0 ; broadcast kExpandCarryK (0x514530c3) -vpaddd ymm0, [rdi], ymm1 ; add to lanes 0–7 -vpaddd ymm0, [rdi+32],ymm0 ; add to lanes 8–15 -vpor ymm1, ymm0, ymm2 ; hor_or intermediate (8 lanes) -vpand ymm1, ymm0, ymm1 ; hor_and intermediate (8 lanes) -; shuffle-tree 8→4→2→1 via vextracti128 / vpand / vpor / vpsrldq (×2) -; scalar carry-bit → neededMask decode via shl/and/test/cmov (branchless) -vzeroupper; ret -``` - -**Codegen for the gather-site loop (within `runPrototype`/`main`):** - -- `activeMask = (leafSlice != UnusedLeafIndex)`: `vpcmpeqd ymm × 4` + `vmovmskps ymm × 2` — fully vectorized. 
-- `leafMask = activeMask & (leafSlice == currentLeafID)`: `vpbroadcastd` + `vpcmpeqd ymm × 2` + `vmovmskps ymm × 2` + scalar AND — fully vectorized. -- Sentinel broadcast: `0x41044104` literal → `vpbroadcastd ymm` × 2 stores filling all 64 bytes. -- `expandVoxelOffset` scatter (per leafMask lane): scalar — 5 ops inlined per lane, gated by bit tests on the 16-bit bitmask. Not vectorizable due to `if (leafMask[i])` branch; dominated by downstream `probeLeaf` calls anyway. - -**Box stencil (R=1, up to 26 active bits):** face directions use the same algorithm -with different thresholds; edge and corner directions require AND of pairwise/triple -conditions and are left for future work. - -`computeNeededDirs` is the only function that encodes knowledge of the stencil's -reach R and direction-to-offset mapping. Written once per stencil shape. - -### 8f. CPU Block-Level Loop Structure - -**Block dispatch using `nExtraLeaves`.** - -`nExtraLeaves` is the popcount of the entire block's `jumpMap` — already computed -inside `decodeInverseMaps` as the loop bound for the leaf-iteration pass: - -```cpp -int nExtraLeaves = 0; -for (int i = 0; i < JumpMapLength; i++) - nExtraLeaves += util::countOn(jumpMap[i]); -``` - -`nExtraLeaves + 1` equals the total number of center leaves touched within this -block. This value is a natural block-level dispatch condition: - -- `nExtraLeaves == 0`: entire block is single-leaf. No `currentLeafID` advances, - no straddle batches. Can specialize the inner loop to eliminate dead branches. -- `nExtraLeaves >= 1`: at least one leaf transition. At most `nExtraLeaves` straddle - batches exist; all other batches are single-leaf. - -**Loop skeleton (general path):** - -```cpp -uint32_t currentLeafID = firstLeafID; -uint32_t probedMask = 0; -const LeafT* ptrs[NUM_DIRS] = {}; -Coord centerLeafCoord = tree.getFirstNode<0>()[currentLeafID].origin(); - -for (int b = 0; b < BlockWidth; b += SIMDw) { - uint32_t activeMask = non_sentinel_mask(leafIndex + b); - if (!activeMask) continue; - - while (activeMask) { - // Which lanes belong to the current center leaf? - uint32_t leafMask = lanes_equal(leafIndex + b, currentLeafID) & activeMask; - - if (!leafMask) { - // No lanes match: advance to the next leaf - currentLeafID++; - probedMask = 0; - centerLeafCoord = tree.getFirstNode<0>()[currentLeafID].origin(); - continue; - } - - // Build pre-expanded vector at the gather site (only place leafMask is known). - // Broadcast sentinel; overwrite active lanes with real expandVoxelOffset(). - VecU32 expandedVec(kSentinelExpanded); - for (int i = 0; i < SIMDw; i++) - if (leafMask & (1 << i)) expandedVec[i] = expandVoxelOffset(voxelOffset[b+i]); - uint32_t neededMask = computeNeededDirs(expandedVec); - uint32_t toProbe = neededMask & ~probedMask; - while (toProbe) { - int d = __builtin_ctz(toProbe); - ptrs[d] = acc.get(neighborCoord(centerLeafCoord, d)); - probedMask |= (1u << d); - toProbe &= toProbe - 1; - } - - computeStencil(leafMask, ptrs, voxelOffset + b, data + b); - activeMask &= ~leafMask; - } -} -``` - -**Key invariants:** -- `currentLeafID` is monotonically non-decreasing across the entire block; it - advances at most `nExtraLeaves` times. -- `probedMask` is reset only when `currentLeafID` changes — not on every batch. - Directions probed in earlier batches stay cached. -- For single-leaf blocks, the `if (!leafMask)` branch is dead, `currentLeafID` - never changes, and `probedMask` accumulates across all batches in the block. 
-- For straddle batches, the `while (activeMask)` iterates twice (once per leaf
-  present in the batch), each time consuming its subset of lanes.
-
-### 8g. computeStencil Vectorization
-
-The outer loop over lanes (i = 0 .. SIMDw-1) calls `computeStencil` once per lane
-with output into a SoA `stencilData[N][SIMDw]` array. Auto-vectorization strategy
-(a concrete sketch closes this document, after §9):
-
-- `[[clang::always_inline]]` on `computeStencil`.
-- `__restrict__` on output pointers.
-- `#pragma clang loop vectorize(enable) vectorize_width(16)` on the outer lane loop.
-- Output via `std::array` (proven to vectorize; POD struct output
-  vectorizes the wrong dimension).
-
----
-
-## 9. Open Questions / Deferred Decisions
-
-- **Launcher design**: the system-level wrapper that dispatches per-block calls
-  (the `buildVoxelBlockManager` analogue for the stencil gather). Deferred until
-  the per-block kernel is validated.
-
-- **Stencil type definition**: the geometry-only stencil descriptor (§3) needs a
-  concrete C++ form — whether a new family of types, a thin wrapper around existing
-  `BaseStencil` specializations, or a standalone `constexpr` struct. The offset
-  table representation (`std::array<Coord, N>` vs a static `constexpr` accessor
-  function) is also TBD.
-
-- **K deduction vs explicit parameter**: whether K (output count) is deduced from the
-  lambda's return type via `decltype` / CTAD, or supplied as an explicit template
-  parameter alongside the stencil type.
-
-- **`ptrs[]` layout — GPU vs CPU divergence**: the GPU design keeps `ptrs[3][3]`
-  (axis × {lo,center,hi}), probing unconditionally per thread. The CPU design uses
-  the canonical `ptrs[27]` + `probedMask` (§6b) as common infrastructure, then
-  populates a stencil-specific `batchPtrs` (§6c). These two designs are intentionally
-  separate; no unification is needed.
-
-- **`nExtraLeaves` surfacing**: recomputed cheaply from the block's jumpMap after
-  `decodeInverseMaps` returns (popcount loop, same as the internal loop bound).
-  `decodeInverseMaps` API is not modified — avoids CPU/GPU asymmetry.
-
-- **Prototype — DONE** (`ex_stencil_gather_cpu/stencil_gather_cpu.cpp`):
-  Phase 1 (neighbor leaf resolution) fully implemented and verified:
-  - `generateDomain` + VBM build + `decodeInverseMaps` per block.
-  - Full §8d probeLeaf + `batchPtrs[4][SIMDw]` population with lazy `probedMask`.
-  - `computeNeededDirs` with shift-OR carry trick (§8e) and gather-site sentinel.
-  - Always-on scalar cross-check at every `computeNeededDirs` call site.
-  - `verifyComputeNeededDirsSentinel()`: dedicated straddle-lane sentinel unit test.
-  - `verifyBatchPtrs()`: end-to-end per-lane batchPtrs check against direct `probeLeaf`.
-  - AVX2 codegen confirmed via `objdump` for `computeNeededDirs` and all mask
-    operations in the outer/inner loops (see §8e Codegen notes).
-  - Next: implement `computeStencil` (Phase 2 — index gather) and the scalar
-    cross-check launcher.
-
-- **Generalizing beyond R ≤ 3**: the single-neighbor-per-axis assumption is baked
-  into the current design. Any stencil with R > 3 would require revisiting §5a and §6.
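-
-To close, a minimal sketch of the §8g recipe. The names (`laneKernel`,
-`fillBatch`) are hypothetical; the load-bearing pieces are the always-inline
-attribute, the `__restrict__` qualifiers, the lane-loop pragma, and the
-`std::array` return type.
-
-```cpp
-#include <array>
-
-constexpr int SIMDw = 16;   // batch width (§8)
-constexpr int N     = 19;   // stencil points (WENO5)
-
-// Hypothetical per-lane kernel: computes the N outputs for lane i.
-[[clang::always_inline]] inline std::array<float, N>
-laneKernel(const float* __restrict__ in, int i)
-{
-    std::array<float, N> out{};
-    for (int k = 0; k < N; ++k)
-        out[k] = in[k * SIMDw + i];
-    return out;
-}
-
-// Outer lane loop; the SoA layout stencilData[N][SIMDw] lets the compiler
-// vectorize across i (the lane dimension) rather than across k.
-void fillBatch(const float* __restrict__ in,
-               float (* __restrict__ stencilData)[SIMDw])
-{
-#pragma clang loop vectorize(enable) vectorize_width(16)
-    for (int i = 0; i < SIMDw; ++i) {
-        const std::array<float, N> taps = laneKernel(in, i);
-        for (int k = 0; k < N; ++k)
-            stencilData[k][i] = taps[k];
-    }
-}
-```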
diff --git a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp index 71b589af72..64d20debfe 100644 --- a/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp +++ b/nanovdb/nanovdb/examples/ex_weno_nanovdb_cpu/weno_nanovdb_cpu.cpp @@ -7,8 +7,7 @@ \brief End-to-end CPU WENO5 norm-square-gradient on a narrow-band level set, with a scalar reference for correctness validation. - Demonstrates the full Phase-2+3 pipeline that BatchAccessor.md Sec. 11 has - been leading up to: + End-to-end pipeline: VBM decode -> per-batch sidecar value assembly -> out-of-band sign-extrapolation -> SIMD Godunov WENO5 -> per-voxel |grad phi|^2 @@ -22,10 +21,8 @@ extrapolation "for free", matching our explicit extrapolate() semantics on in-the-band-typical topology. - fast : LegacyStencilAccessor gather -> WenoStencil load -> + fast : WenoStencil::gatherIndices() -> per-tap SIMD load -> extrapolate() -> normSqGrad() -> per-lane scalar store. - No hybrid SIMD StencilAccessor; voxel-outer Legacy path - for code clarity. Both passes write to the same-shape output buffer, keyed by ValueOnIndex slot; a histogram of |outputRef - outputFast| follows. @@ -46,7 +43,6 @@ #include #include #include -#include #include #include // scalar reference WenoStencil @@ -85,8 +81,6 @@ using LeafT = nanovdb::NanoLeaf; using FloatGridT = nanovdb::NanoGrid; using CPUVBM = nanovdb::tools::VoxelBlockManager; -using LegacyAccT = nanovdb::LegacyStencilAccessor>; - // ============================================================ // VDB loading and NanoVDB conversion // ============================================================ @@ -223,7 +217,7 @@ runReference(const FloatGridT& floatGrid, } // ============================================================ -// Fast pass -- LegacyStencilAccessor gather + WenoStencil compute +// Fast pass -- WenoStencil::gatherIndices + WenoStencil compute // ============================================================ // // Structure: @@ -231,7 +225,7 @@ runReference(const FloatGridT& floatGrid, // decodeInverseMaps -> leafIndex[128], voxelOffset[128] // for each batch of SIMDw voxels: // fill: scalar scatter from sidecar into raw_values[SIZE][SIMDw] -// via LegacyStencilAccessor::moveTo per voxel +// via WenoStencil::gatherIndices() per voxel // load: per-tap SIMD load into stencil.values[] / isActive[] // extrapolate (sign-fix OOB lanes in-place, Simd) // normSqGrad -> FloatV @@ -280,8 +274,9 @@ runFast(const IndexGridT& index StencilT stencil(dx); - // One LegacyStencilAccessor per TBB task (one ReadAccessor). - LegacyAccT legacyAcc(indexGrid); + // One leaf-only ReadAccessor per TBB task; cache stays warm + // across the SIZE getValue calls in WenoStencil::gatherIndices(). + nanovdb::ReadAccessor acc(indexGrid.tree().root()); const float* const scIn = sidecar.data(); float* const scOut = outputFast.data(); @@ -297,10 +292,7 @@ runFast(const IndexGridT& index firstOffset + (uint64_t)bID * BlockWidth; for (int batchStart = 0; batchStart < BlockWidth; batchStart += SIMDw) { - // -------- Fill: LegacyStencilAccessor per voxel -------- - // Voxel-outer, tap-inner inside the moveTo call - // (fillTaps unrolls the 19 tap lookups against the - // shared ReadAccessor). Zero-fill inactive lanes. 
+ // -------- Fill: per-voxel gatherIndices + sidecar lookup -------- for (int i = 0; i < SIMDw; ++i) { const int p = batchStart + i; @@ -319,11 +311,11 @@ runFast(const IndexGridT& index const nanovdb::Coord center = cOrigin + nanovdb::Coord(lx, ly, lz); - legacyAcc.moveTo(center); + uint64_t indices[SIZE]; + nanovdb::WenoStencil::gatherIndices(acc, center, indices); for (int k = 0; k < SIZE; ++k) { - const uint64_t idx = legacyAcc[k]; - raw_values[k][i] = scIn[idx]; - raw_active[k][i] = (idx != 0); + raw_values[k][i] = scIn[indices[k]]; + raw_active[k][i] = (indices[k] != 0); } } diff --git a/nanovdb/nanovdb/util/BatchAccessor.md b/nanovdb/nanovdb/util/BatchAccessor.md deleted file mode 100644 index 6f7fa5df60..0000000000 --- a/nanovdb/nanovdb/util/BatchAccessor.md +++ /dev/null @@ -1,1885 +0,0 @@ -# BatchAccessor — SIMD Batch Leaf-Neighborhood Cache - -Design reference for `BatchAccessor.h`. Captures the full design rationale -and API contract developed alongside the `ex_stencil_gather_cpu` Phase 1 prototype. - ---- - -## 1. Motivation and Core Analogy - -NanoVDB's `DefaultReadAccessor` amortizes the cost of root-to-leaf tree traversal -by caching the path for a single voxel. When successive scalar `getValue(ijk)` calls -land in the same leaf, only the first call pays the full traversal. - -`BatchAccessor` lifts this idea one level: instead of caching the path to one leaf, -it caches the **3×3×3 neighborhood of leaf pointers** surrounding the current center -leaf. Instead of serving one voxel per call, it serves a **SIMD batch of LaneWidth -voxels** simultaneously. - -| Property | Scalar `ValueAccessor` | `BatchAccessor` | -|----------|------------------------|-----------------| -| Cache unit | Path root→leaf (3 node ptrs) | 27 neighbor leaf ptrs | -| Granularity | 1 voxel per call | LaneWidth voxels per call | -| Cache key | Voxel coordinate in cached leaf's bbox | `mCenterLeafID` | -| "Hit" condition | Next voxel in same leaf | `mProbedMask` covers needed direction | -| Eviction trigger | Implicit on any miss | Explicit: `none_of(leafMask)` | -| Hit rate guarantee | Access-pattern dependent | Structural (VBM Morton ordering) | - -The hit rate of the scalar accessor depends on the access pattern. `BatchAccessor`'s -amortization is **structural**: the VBM groups voxels by leaf, so within any batch the -center leaf is known in advance, and directions probed for batch k remain valid for all -subsequent batches in the same center leaf. - ---- - -## 2. Template Parameters - -```cpp -template -class BatchAccessor; -``` - -| Parameter | Scalar default | SIMD example | Role | -|-----------|---------------|--------------|------| -| `BuildT` | — | — | NanoVDB build type; determines `LeafT`, `TreeT` | -| `ValueT` | `float` | `Simd` | Result type of `cachedGetValue` | -| `VoxelOffsetT` | `uint16_t` | `Simd` | Compact 9-bit voxel offset within a leaf | -| `PredicateT` | `bool` | `SimdMask` | Per-lane active predicate | - -For `NanoGrid`, use `ValueT = uint64_t` (scalar) or -`ValueT = Simd` (SIMD). - -The scalar defaults allow instantiation without a SIMD library, giving a clean -scalar path for debugging and cross-validation. - -Per-lane access is provided by `nanovdb::util::simd_traits` (defined in `Simd.h`), -which works for both scalar and vector types via specialisation. - ---- - -## 3. 
Persistent State
-
-Members that persist across batches within one center leaf:
-
-```cpp
-const GridT& mGrid;              // for probeLeaf calls via mGrid.tree()
-uint32_t mCenterLeafID;          // index of current center leaf
-Coord mCenterOrigin;             // world-space origin of current center leaf
-uint32_t mProbedMask;            // bit 13 (center) pre-set at construction
-uint32_t mNeighborLeafIDs[27];   // kNullLeafID when outside narrow band or unprobed
-
-const uint64_t* const mOffsetBase;    // &getFirstLeaf()[0].data()->mOffset
-const uint64_t* const mPrefixBase;    // &getFirstLeaf()[0].data()->mPrefixSum
-const uint64_t* const mMaskWordBase;  // getFirstLeaf()[0].valueMask().words()
-```
-
-**Direction encoding** (`dir` is a `static constexpr` member):
-
-```
-dir(dx, dy, dz) = (dx+1)*9 + (dy+1)*3 + (dz+1)    dx,dy,dz ∈ {-1,0,+1}
-```
-
-`mNeighborLeafIDs[27]` is a flat array indexed by `dir(dx,dy,dz)`.
-`mNeighborLeafIDs[13]` (= `dir(0,0,0)`) holds the center leaf ID.
-`mNeighborLeafIDs[d] = kNullLeafID` when the neighbor lies outside the narrow band
-or has not yet been probed.
-
-```cpp
-static constexpr uint32_t kNullLeafID = ~uint32_t(0);
-```
-
-**Why leaf IDs, not pointers:** `cachedGetValue` fetches `mOffset`, `mPrefixSum`, and
-`valueMask().words()[w]` for all active lanes via SIMD gathers (§8d–§8e). The gather
-index is `leaf_id × (sizeof(LeafT)/sizeof(uint64_t))`, computed as a
-`Simd<int64_t, LaneWidth>` (see §8e). Storing IDs enables a single flat-base gather
-over the contiguous leaf array; storing pointers would require per-lane pointer
-arithmetic that doesn't map to `vpgatherqq`. The `kNullLeafID` sentinel is masked
-out before any gather via `valid_u32` (§8e).
-
-**Class-level base pointers:** `mOffsetBase`, `mPrefixBase`, and `mMaskWordBase` are
-`const` pointers computed once in the constructor from `getFirstLeaf()[0]`. They are
-invariant over the lifetime of the accessor (the leaf array is fixed after grid
-construction) and are shared across all 18 `cachedGetValue` instantiations in a
-WENO5 gather, avoiding the equivalent recomputation in every call.
-
-**Cache advance:** when `none_of(leafMask)` fires in the outer loop:
-
-```cpp
-void advance(uint32_t newLeafID) {
-    mCenterLeafID = newLeafID;
-    mCenterOrigin = mGrid.tree().getFirstLeaf()[newLeafID].origin();
-    for (auto& id : mNeighborLeafIDs) id = kNullLeafID;
-    mNeighborLeafIDs[dir(0, 0, 0)] = newLeafID;
-    mProbedMask = (1u << dir(0, 0, 0));
-}
-```
-
-All 27 entries are reset to `kNullLeafID` on advance; `mProbedMask` is set to only
-bit 13. `toProbe = neededMask & ~mProbedMask` therefore never yields a stale direction.
-
----
-
-## 4. Eviction and the Straddle Problem
-
-In a SIMD batch, "straddle lanes" are active voxels that belong to a *later* leaf
-(`leafIndex[i] != mCenterLeafID`, `leafMask[i] = false`). They do NOT trigger an
-eviction — the cache is still valid for the remaining current-leaf lanes.
-
-Eviction fires only when `none_of(leafMask)` — no lane in the batch belongs to the
-current leaf.
-
-`leafMask` is the accessor's **partial-hit signal** — a concept with no scalar analog.
-
-Straddle lanes are given the inactive sentinel voxel offset `kInactiveVoxelOffset`
-(= local coordinate (4,4,4)), which is strictly interior to the leaf and generates
-no false crossing detections.
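-
-A compile-time restatement of that sentinel claim, using the `x*64 + y*8 + z`
-leaf layout (`packLocal` is illustrative, not library code):
-
-```cpp
-#include <cstdint>
-
-constexpr uint16_t packLocal(int x, int y, int z)
-{
-    return uint16_t(x * 64 + y * 8 + z);   // NanoVDB leaf layout
-}
-
-static_assert(packLocal(4, 4, 4) == 292, "kInactiveVoxelOffset value");
-// A WENO5 tap moves at most 3 along one axis; 4 ± 3 stays inside [0,7],
-// so a sentinel lane can never report a leaf crossing.
-static_assert(4 - 3 >= 0 && 4 + 3 <= 7, "sentinel stays interior");
-```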
-The outer `while (any_of(activeMask))` loop processes one leaf ID per
-iteration, re-using the same SIMD batch:
-
-```
-while any_of(activeMask):
-    leafMask = activeMask & (leafIndex_vec == mCenterLeafID)
-    if none_of(leafMask):
-        acc.advance(++currentLeafID)
-        continue
-    # prefetch + cachedGetValue for leafMask lanes only
-    acc.prefetch<...>(vo, leafMask)
-    acc.cachedGetValue<...>(result, vo, leafMask)  # fills leafMask lanes of result
-    activeMask &= ~leafMask
-# all lanes now filled; call kernel once with complete result
-```
-
----
-
-## 5. Center Leaf Initialisation — Eager (Constructor and advance)
-
-`mNeighborLeafIDs[dir(0,0,0)]` (center) is populated **eagerly** by both the
-constructor and `advance()`:
-
-```cpp
-mNeighborLeafIDs[dir(0,0,0)] = mCenterLeafID;
-mProbedMask = (1u << dir(0,0,0));   // bit 13 pre-set
-```
-
-The center pointer is O(1) to compute — no `probeLeaf` traversal needed — so there
-is no reason to defer it.
-
-**Consequences:**
-
-- `cachedGetValue<0,0,0>` (center tap) is valid immediately after construction or
-  `advance()`, without any `prefetch` call.
-- The SWAR `neededMask` computed inside `prefetch` never needs to include bit 13:
-  crossings are detected per-axis, and a lane whose tap stays in the center leaf
-  contributes `dir(0,0,0)` which is already in `mProbedMask` and filtered by
-  `toProbe = neededMask & ~mProbedMask`.
-- The `if (d == dir(0,0,0))` special case is removed from the probe loop: every
-  direction in `toProbe` is a genuine neighbor requiring `probeLeaf`.
-
-## 6. API
-
-### 6a. Direction Helper
-
-```cpp
-static constexpr int dir(int dx, int dy, int dz);
-```
-
-### 6b. Lifecycle
-
-```cpp
-BatchAccessor(const GridT& grid, uint32_t firstLeafID);
-void advance(uint32_t newLeafID);
-```
-
-### 6c. Tier 1a — `prefetch`
-
-```cpp
-template <int di, int dj, int dk>
-void prefetch(VoxelOffsetT vo, PredicateT leafMask);
-```
-
-- Computes the neighbor direction for each active lane.
-- Probes at most one new leaf per unique direction per call (skips directions
-  already in `mProbedMask`).
-- Calls `mGrid.tree().probeLeaf(coord)` directly — no `AccT` parameter.
-  `ReadAccessor` is not used because `probeLeaf` only hits the LEVEL=0 leaf cache,
-  which is never warm for neighbor leaves; the internal-node caches are bypassed
-  entirely for `GetLeaf` operations.
-- The center direction is set from `mCenterLeafID` without `probeLeaf`.
-
-### 6d. Tier 1b — `cachedGetValue`
-
-```cpp
-template <int di, int dj, int dk>
-void cachedGetValue(ValueT& result, VoxelOffsetT vo, PredicateT leafMask) const;
-```
-
-- Fills **only the `leafMask` lanes** of `result` (by reference) via a 2-arg `where`
-  directly on `result` — no intermediate copy, no write-back.
-- `leafMask`-clear lanes are **not touched**: values from a previous iteration are
-  preserved exactly as the caller left them.
-- Additionally, lanes for which the tap voxel is inactive (outside the narrow band
-  within an existing neighbor leaf) are also not written; `result` retains whatever
-  default the caller initialised it to (typically 0 for a zero-initialized stencil
-  buffer, matching `ValueOnIndex::getValue`'s return of 0 for inactive voxels).
-- This contract suits the straddle-aware outer loop: the caller declares stencil
-  result variables (zero-initialised) before the `while` loop, fills them
-  progressively across iterations, and calls the kernel once after `activeMask`
-  is empty.
-- Requires the corresponding direction to be in `mProbedMask` (asserted in debug).
-- `kNullLeafID` leaf (neighbor outside the narrow band entirely) also leaves `result`
-  untouched, for the same reason: `maskWords = 0` → `isActive = false`.
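-
-Collected in one declaration, the §6 surface looks as follows (a sketch: the
-template heads are reconstructed from the §7 usage and the §2 parameter table):
-
-```cpp
-#include <cstdint>
-#include <nanovdb/NanoVDB.h>
-
-template <typename BuildT, typename ValueT, typename VoxelOffsetT,
-          typename PredicateT>
-class BatchAccessor
-{
-    using GridT = nanovdb::NanoGrid<BuildT>;
-public:
-    static constexpr int dir(int dx, int dy, int dz)          // §6a
-    {
-        return (dx + 1) * 9 + (dy + 1) * 3 + (dz + 1);
-    }
-
-    BatchAccessor(const GridT& grid, uint32_t firstLeafID);   // §6b
-    void advance(uint32_t newLeafID);                         // §6b
-
-    template <int di, int dj, int dk>
-    void prefetch(VoxelOffsetT vo, PredicateT leafMask);      // §6c
-
-    template <int di, int dj, int dk>
-    void cachedGetValue(ValueT& result, VoxelOffsetT vo,
-                        PredicateT leafMask) const;           // §6d
-};
-```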
-
-### 6e. Deferred
-
-`getValue` (lazy combined) and the runtime `nanovdb::Coord` overload
-are not yet implemented. Both are additive and straightforward once the two
-primitives above are validated.
-
----
-
-## 7. Prefetch Patterns
-
-### WENO5 (R=3, axis-aligned) — 6 extremal taps
-
-```cpp
-acc.prefetch<-3, 0, 0>(vo, leafMask);
-acc.prefetch<+3, 0, 0>(vo, leafMask);
-acc.prefetch< 0,-3, 0>(vo, leafMask);
-acc.prefetch< 0,+3, 0>(vo, leafMask);
-acc.prefetch< 0, 0,-3>(vo, leafMask);
-acc.prefetch< 0, 0,+3>(vo, leafMask);
-// All subsequent cachedGetValue calls are pure arithmetic — no tree access.
-auto u_m3 = /* ... */; acc.cachedGetValue<-3,0,0>(u_m3, vo, leafMask);
-auto u_m2 = /* ... */; acc.cachedGetValue<-2,0,0>(u_m2, vo, leafMask);
-// ... 19 taps total
-Simd<float, LaneWidth> flux_x = wenoKernel(u_m3, u_m2, u_m1, u_0, u_p1, u_p2, u_p3);
-```
-
-### Box stencil (R=1) — 8 corner taps
-
-```cpp
-for each (sx,sy,sz) in {±1}³:
-    acc.prefetch<sx, sy, sz>(vo, leafMask);
-// then cachedGetValue for all 27 taps
-```
-
----
-
-## 8. Implementation Notes
-
-### 8a. SIMD structure of prefetch and cachedGetValue
-
-**`prefetch` — fully SIMD for the crossing detection, scalar only for probeLeaf**
-
-`prefetch` contains no per-lane scalar loop. The crossing decision uses:
-
-1. **SWAR expansion** (YMM throughout): `vpsllw`, `vpor`, `vpand` — maps the 9-bit
-   voxel offset vector into the 15-bit packed form across all LaneWidth lanes.
-2. **Sentinel blend**: `vpblendvb` — applies `leafMask` in one instruction.
-3. **Add**: `vpaddw` — adds the compile-time `packed_tap` across all lanes.
-4. **Horizontal reductions**: `vextracti128` + `vpand`/`vpor` tree → scalar `hor_and`
-   / `hor_or` — unavoidable for the crossing decision, which is a single bool per axis.
-
-Assembly-confirmed (Release, `-O3 -mavx2`, `ex_stencil_gather_cpu`):
-
-```
-vmovdqu (%rbx,%rax,2),%ymm2        ; load vo (16 × uint16_t)
-vpsllw $0x4,%ymm2,%ymm0            ; vo << 4
-vpor %ymm2,%ymm0,%ymm0             ; vo | (vo << 4)
-vpand %ymm1,%ymm0,%ymm0            ; & 0x1C07
-vpsllw $0x2,%ymm2,%ymm1            ; vo << 2
-vpand %ymm2,%ymm1,%ymm1            ; & 0xE0
-vpor %ymm1,%ymm0,%ymm0             ; → expanded
-vpblendvb %ymm1,%ymm0,%ymm6,%ymm1  ; where(leafMask, packed_lc) = expanded
-vpaddw %ymm2,%ymm1,%ymm1           ; packed_sum = packed_lc + packed_tap
-vextracti128 $0x1,%ymm1,%xmm2      ; \
-vpand %xmm1,%xmm2,%xmm2            ; | hor_and tree:
-vpunpckhwd ...                     ; | 16→8→4→2→1 lanes
-vpand ...; vpshufd ...; vpand ..   ; |
-vpextrw $0x0,%xmm1,%eax            ; / scalar hor_and
-```
-
-After the scalar crossing check, `probeLeaf` is called at most once per unique
-direction per center leaf — inherently scalar tree traversal, not per-voxel.
-
-**`cachedGetValue` — fully SIMD, no scalar loop**
-
-`cachedGetValue` is fully vectorised end-to-end. The scalar `leaf->getValue(offset)`
-loop has been replaced by the gather chain described in §8e. The result is written
-directly to `result` via a 2-arg `where(isActive, result) = ...` — no intermediate
-variable, no write-back copy.
-
-### 8b. No tree accessor in prefetch
-
-NanoVDB's `ReadAccessor` is not passed to `prefetch`. Its LEVEL=0 leaf cache is never
-warm for neighbor leaves (by definition distinct from the center leaf), and its
-internal-node caches are bypassed entirely when `get` misses at LEVEL=0.
-`probeLeaf` is equivalent to a direct root traversal in all non-trivial cases.
-
-### 8c. 
probeLeaf returns nullptr for missing neighbors - -`mGrid.tree().probeLeaf(coord)` returns `nullptr` when the requested coordinate lies -outside the active narrow band. `prefetch` stores `kNullLeafID` in -`mNeighborLeafIDs[d]` for those directions. `cachedGetValue` detects `kNullLeafID` -and writes `ScalarValueT(0)` for those lanes, which is correct for level-set grids -(background value = 0). The SIMD gather chain masks out `kNullLeafID` lanes via the -`valid_u32` mask before accessing any leaf data. - -### 8d. SWAR direction extraction — the base-32 multiply trick - -`cachedGetValue` must compute a **per-lane** neighbor direction `dir ∈ [0,26]` at -runtime, because for a fixed compile-time tap `(di, dj, dk)` different lanes can land -in different neighbor leaves (one lane may cross only the z-face; another may cross -x and z; another may stay in the center leaf). - -`dir` is the mixed-radix value `dir = cz + 3·cy + 9·cx` where each carry component -`cz, cy, cx ∈ {0,1,2}` encodes {underflow, in-leaf, overflow} for the z-, y-, x-axis -respectively. The carry components are already sitting inside the SWAR `packed_sum` -(see §8a / `prefetch` implementation) at bit positions [3:4], [8:9], [13:14]. - -**Step 1 — extract carry pairs into base-32 digits** - -```cpp -// mask the six carry bits, right-shift by 3 -// result layout: 0b 00xx 000 yy 000 zz (three 2-bit fields, 3-bit gaps) -auto v = (packed_sum & VoxelOffsetT(0x6318u)) >> 3; -``` - -The 3-bit gaps are not accidental: the 5-bit SWAR groups naturally give a -**base-32 representation**. With the `>> 3` shift, `v` is the 3-digit duotrigesimal -(base-32) number `0d cx·cy·cz`, where digit-k = the carry component for axis k. - -**Step 2 — re-evaluate the same digits in base 3 via a single multiply** - -```cpp -// 0d 1'3'9 = 1·32² + 3·32 + 9 = 1024 + 96 + 9 = 1129 -auto dir_vec = (v * VoxelOffsetT(1129u)) >> 10; -// bits [10:14] of the product = digit-2 of v·(0d 1'3'9) = cz + 3·cy + 9·cx = dir -``` - -**Why digit-2 of the product equals `dir`:** - -Base-32 long multiplication `(0d cx·cy·cz) × (0d 1·3·9)`: - -| Digit of product | Contributions | Max value | -|---|---|---| -| 0 | 9·cz | 18 | -| 1 | 3·cz + 9·cy | 24 | -| **2** | **cz + 3·cy + 9·cx** | **26** | -| 3 | cy + 3·cx | 8 | -| 4 | cx | 2 | - -Every digit sum is **< 32**, so **no carries propagate between base-32 digits**. -Digit 2 is therefore exact: it equals `cz + 3·cy + 9·cx = dir` with no contamination -from adjacent digits. Digit-2 occupies bits [10:14] of the integer product, which is -why `>> 10` (and an optional `& 31`) extracts it. - -**Overflow note:** `v` fits in `uint16_t` (max = 2 + 2·32 + 2·1024 = 2114), and -`v · 1129` reaches up to 2 386 706 — a 22-bit value that overflows `uint16_t`. -**No widening is required**, however: we extract bits [10:14] of the product, and those -bits sit entirely below bit 16. Masking to 16 bits removes only bits 16+, leaving -bits [10:14] intact. The `uint16_t` modular product gives the same result as the -full-width product for all valid and sentinel inputs. - -**Compile-time sanity check** (all 27 valid inputs): - -```cpp -for (int cx : {0,1,2}) for (int cy : {0,1,2}) for (int cz : {0,1,2}) { - uint32_t v = cz + 32*cy + 1024*cx; - uint32_t dir = (v * 1129u) >> 10; - assert(dir == unsigned(cz + 3*cy + 9*cx)); -} -``` - -### 8e. 
`cachedGetValue` gather pipeline — Steps 1–8 *(historical)*
-
-> **Note — this section describes the prior fully-SIMD design.** The current
-> implementation uses a **hybrid SIMD → scalar-tail** design (see §8i): Step 1
-> (`d_vec`) plus a parallel local-offset extraction stay SIMD (native `__m256i`
-> uint16 arithmetic with no aggregate ABI), then per-lane values are harvested
-> into stack C arrays and the leaf lookup runs as a plain scalar loop calling
-> `leaf.getValue(offset)` directly. Steps 2–8 below no longer appear in the
-> source. The material is preserved here as the rationale behind the original
-> SIMD gather chain and the baseline the hybrid was compared against.
-
-`cachedGetValue` recomputes `packed_sum` identically to `prefetch` (§8a), then runs
-the following fully-SIMD pipeline. All types are SIMD vectors of the indicated
-element type; scalar `LaneWidth==1` degrades to plain scalar types.
-
-```
-Step 1 — d_vec (Simd<uint16_t, LaneWidth>)
-    base-32 multiply trick (§8d): per-lane dir ∈ [0,26]
-
-Step 2 — tapLeafID_u32 (Simd<uint32_t, LaneWidth>)
-    gather_if(tapLeafID_u32, leafMask, mNeighborLeafIDs, d_vec)
-    valid_u32 = (tapLeafID_u32 != kNullLeafID)  ← effective mask for steps 3–5
-
-Step 3 — tapLeafOffset_i64 (Simd<int64_t, LaneWidth>)
-    simd_cast_if(tapLeafOffset_i64, valid_u32, tapLeafID_u32)
-    tapLeafOffset_i64 *= kStride   (kStride = sizeof(LeafT)/sizeof(uint64_t))
-
-    Widening to int64_t is required: uint32_t * kStride overflows for large leaf
-    pools (kNullLeafID = 0xFFFFFFFF). simd_cast_if writes 0 for invalid lanes,
-    keeping gather indices non-negative. x86 vpgatherqq treats indices as signed
-    int64_t, so negative values would access memory before the base pointer.
-
-Step 4a — offsets (Simd<uint64_t, LaneWidth>)
-    gather_if(offsets, valid_u32, mOffsetBase, tapLeafOffset_i64)
-    → leaf->mOffset for each valid lane
-
-Step 4b — prefixSums (Simd<uint64_t, LaneWidth>)
-    gather_if(prefixSums, valid_u32, mPrefixBase, tapLeafOffset_i64)
-    Extract field w from packed mPrefixSum:
-        shift = (w > 0) ? (w-1)*9 : 0
-        prefixSums = (w > 0) ? (prefixSums >> shift) & 511 : 0
-
-    mPrefixSum packs 7 nine-bit prefix counts in one uint64_t:
-    field w (1..7) at bits [9*(w-1) +: 9]; field 0 is defined as 0 (empty prefix).
-
-Step 5 — maskWords (Simd<uint64_t, LaneWidth>)
-    mask_idx = tapLeafOffset_i64 + simd_cast(wordIdx_u16)
-    gather_if(maskWords, valid_u32, mMaskWordBase, mask_idx)
-    → valueMask().words()[w] for each valid lane
-
-    Heterogeneous mask: valid_u32 is a SimdMask<uint32_t, LaneWidth> applied to
-    uint64_t data. Implemented via the MaskElemT template parameter on gather_if
-    in Simd.h.
-
-Step 6 — dest_yz (Simd<uint16_t, LaneWidth>)
-    dest_yz = ((packed_sum >> 2) & 0x38) | (packed_sum & 0x07)
-    → ny_w*8 + nz_w (6-bit intra-word bit position, range [0,63])
-
-Step 7 — activity check + truncated maskWord
-    voxelBit  = 1ull << dest_yz
-    isActive  = (maskWords & voxelBit) != 0
-    truncated = maskWords & (voxelBit - 1)
-
-    ValueOnIndex::getValue returns 0 for inactive voxels (bit not set in valueMask).
-    Null-leaf lanes have maskWords=0, so isActive=false there too — no explicit
-    valid_u32 guard is needed at this step.
-
-Step 8 — fill result
-    where(isActive, result) = offsets + prefixSums + popcount(truncated)
-
-    2-arg where writes only active lanes; leafMask-clear and inactive-voxel lanes
-    are untouched.
-```
-
-**popcount choice:** `popcount(Simd<uint64_t, LaneWidth>)` uses a SWAR shift-and-add
-tree (`popcount64` in `Simd.h`) rather than `__builtin_popcountll`. AVX2 lacks a
-64-bit lane-wise popcount (VPOPCNTQ is AVX-512DQ); `__builtin_popcountll` maps to
-the scalar `popcnt` instruction, which is not vectorisable.
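-
-A per-lane sketch of a SWAR tree of that shape (illustrative; the shipped
-`popcount64` may differ in detail, but like this sketch it needs neither a
-multiply nor the scalar `popcnt` instruction):
-
-```cpp
-#include <cstdint>
-
-// Branch-free 64-bit popcount from shifts, ands, adds and one subtract;
-// applied lane-wise these map to vpsrlq / vpand / vpsubq / vpaddq on AVX2.
-inline uint64_t popcount64(uint64_t v)
-{
-    v = v - ((v >> 1) & 0x5555555555555555ull);   // 2-bit partial sums
-    v = (v & 0x3333333333333333ull)
-      + ((v >> 2) & 0x3333333333333333ull);       // 4-bit partial sums
-    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;   // per-byte sums
-    v += v >> 8;  v += v >> 16;  v += v >> 32;    // horizontal byte add
-    return v & 0x7F;
-}
-```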
-The SWAR tree uses only `vpsrlq` / `vpand` / `vpsubq` / `vpaddq`, which are all
-AVX2-native.
-
-### 8f. Assembly codegen — compiler × backend × ISA matrix
-
-Flags: `-O3 -DNDEBUG -std=c++17 -fopenmp-simd -Wno-invalid-offsetof`.
-ISA: `-mavx2` (base) or `-march=native` (i9-285K Arrow Lake, AVX2; no AVX-512).
-Representative instantiation: `cachedGetValue<-3,0,0>` (x−3 tap, W=16), full Steps 1–8.
-
-**Backend selection:** Simd.h auto-detects `<experimental/simd>` via `__has_include`.
-`-DNANOVDB_USE_STD_SIMD` is redundant when the header is present.
-Use `-DNANOVDB_NO_STD_SIMD` to force the array backend.
-
-#### `cachedGetValue<-3,0,0>` — instruction counts
-
-Numbers reflect the **unmasked-gather variant** (Steps 2/4a/4b changed to `gather`;
-Step 5 `maskWords` kept as `gather_if`). The `ymm`/`xmm`/`calls`/`vzup`/`vpins`
-columns are from the original full measurement; `insns` and `vpgather` are
-post-unmasked-gather. `—` = not separately measured.
-
-| Variant | ISA | insns | ymm | xmm | calls | vzup | vpgather | vpins |
-|---------|-----|------:|----:|----:|------:|-----:|---------:|------:|
-| GCC 13 + stdx | avx2 | 579 | 393 | 100 | 14 | 13 | 0 | 8 |
-| GCC 13 + array | avx2 | 1313 | 605 | 524 | 2 | 3 | 0 | 0 |
-| Clang 18 + stdx | avx2 | 828 | 530 | 470 | 1 | 2 | 0 | 62 |
-| Clang 18 + array | avx2 | 1231 | 459 | 326 | 2 | 2 | 0 | 0 |
-| GCC 13 + stdx | native | 641 | 393 | 100 | 14 | 13 | 0 | 8 |
-| GCC 13 + array | native | 1175 | — | — | 0 | 0 | 0 | — |
-| Clang 18 + stdx | native | 599 | 568 | 284 | 1 | 2 | **16** | 50 |
-| Clang 18 + array | native | 1200 | — | — | — | — | **6** | — |
-
-`vpgather` breakdown (post-unmasked-gather):
-- `clang18-stdx-native`: 4× `vpgatherdd` (Step 2: 16 lanes in 4×4) + 12× `vpgatherqq` (Steps 4a/4b/5: 4-wide ×4 chunks ×3) = 16 total
-- `clang18-array-native`: 2× `vpgatherdd` + 4× `vpgatherqq` = 6 total
-
-#### Before/after delta — unmasked-gather change
-
-| Variant | ISA | insns before | insns after | Δ | vpgather before | vpgather after |
-|---------|-----|------------:|------------:|--:|----------------:|---------------:|
-| GCC 13 + stdx | avx2 | 641 | 579 | −62 | 0 | 0 |
-| GCC 13 + array | avx2 | 1320 | 1313 | −7 | 0 | 0 |
-| Clang 18 + stdx | avx2 | 795 | 828 | +33 | 0 | 0 |
-| Clang 18 + array| avx2 | 1365 | 1231 | −134 | 0 | 0 |
-| GCC 13 + stdx | native | 641 | 641 | 0 | 0 | 0 |
-| GCC 13 + array | native | 1365 | 1175 | −190 | 0 | 0 |
-| Clang 18 + stdx | native | 600 | 599 | −1 | 14 | 16 |
-| Clang 18 + array| native | 1365 | 1200 | −165 | 0 | 6 |
-
-The `clang18-stdx-avx2` regression (+33) is expected: the unmasked `gather` path
-in the stdx backend emits a slightly different `where`-free code sequence that Clang
-does not fold as aggressively as the original `gather_if`. Total instruction count
-is still lower than the array backend.
-
-#### `-mavx2 -mtune=native` equivalence
-
-On this machine (i9-285K Arrow Lake, no AVX-512), `-march=native` and
-`-mavx2 -mtune=native` produce **identical hardware-gather emission** under Clang:
-
-| Variant | flags | insns | vpgdd | vpgqq |
-|---------|-------|------:|------:|------:|
-| Clang 18 + stdx | `-mavx2 -mtune=native` | 599 | 4 | 12 |
-| Clang 18 + array | `-mavx2 -mtune=native` | 1219 | 2 | 4 |
-
-The difference between `-mavx2` and `-march=native` is purely the **tuning model**,
-not the ISA:
-- `-mavx2`: targets `mtune=generic` — conservative gather cost model, no hardware gathers.
-- `-march=native` (Clang): implies `mtune=sierraforest` — knows Arrow Lake's gather
-  throughput, auto-vectorizer considers gathers profitable → emits `vpgatherqq`.
-- `-march=native` (GCC): sets the ISA to sierraforest but keeps `mtune=generic` —
-  same conservative behaviour as `-mavx2`. No hardware gathers emitted by GCC even
-  with `-march=native`.
-
-GCC's stdx backend produces identical output (641 insns before / 579 after, 0 gathers)
-for both `-mavx2` and `-march=native`.
-
-#### `prefetch<-3,0,0>` — standalone vs inlined
-
-| Variant | ISA | standalone symbol? | insns |
-|---------|-----|--------------------|------:|
-| GCC 13 + stdx | any | No — fully inlined | — |
-| GCC 13 + array | avx2 | Yes | 260 |
-| Clang 18 + stdx | any | No — fully inlined | — |
-| Clang 18 + array| avx2 | Yes | 176 |
-
----
-
-**Finding 1 — stdx backend is far superior to the array backend.**
-The array backend is ≈2× larger in instruction count and degrades every `gather_if`
-to a scalar lane-by-lane loop: 16 `vpextrw` to extract uint16_t direction indices, 16
-conditional branches, 16 scalar uint32_t loads from `mNeighborLeafIDs`, then repeated
-for each of the three uint64_t gathers (48 `vpextrq` total). In the stdx backends,
-`gather_if` either maps to hardware gather instructions (Clang + native) or at worst
-compact `vpinsrq` sequences (Clang + avx2). The 76 vpextr instructions (array backend)
-vs 62 vpinsrb/q (stdx avx2) is telling: array is still scalar-inserting via extract,
-not vectorised. The array backend also fails to inline `prefetch`.
-
-**Finding 2 — Clang inlines all helpers; GCC emits 14 out-of-line weak stubs.**
-GCC 13 emits `gather_if`, `simd_cast`, `simd_cast_if`, `where`, and `popcount` as
-out-of-line COMDAT weak symbols and calls them. Each call requires `vzeroupper` on
-entry (AVX ABI), yielding 13 transitions per `cachedGetValue` invocation. Clang 18
-inlines all of them into a single function body except the final `popcount` call.
-
-**Finding 3 — Hardware gathers require Clang + native tuning; unmasked gathers unlock the array backend too.**
-After the unmasked-gather change, `clang18-stdx-native` emits **16** hardware gathers per `cachedGetValue`:
-```
-vpgatherdd — 4× for the uint32_t tapLeafID gather (Step 2: 4-wide × 4 = 16 lanes)
-vpgatherqq — 12× for the three uint64_t data gathers (Steps 4a/4b/5: 4-wide × 4 each)
-```
-`clang18-array-native` now emits **6** hardware gathers (2 vpgdd + 4 vpgqq) — the first
-gathers ever seen in the array backend. The unmasked `for (i) dst[i] = ptr[idx[i]]`
-loop is the pattern Clang's auto-vectorizer converts to `vpgatherqq`; the `if (mask[i])`
-conditional in `gather_if` defeated auto-vectorization for all mask types.
-
-GCC 13 emits 0 hardware gathers even with `-march=native` — its stdx backend does not
-exploit `vpgatherdd`/`vpgatherqq` for `experimental::simd` gather operations. With
-`-mavx2` alone, Clang also falls back to software gather (62 `vpinsrq/b`).
-
-The 50 `vpinsrb` that remain in `clang18-stdx-native` are the mask-widening cost for
-the one remaining heterogeneous `gather_if` (Step 5 `maskWords`): the 16-lane
-`SimdMask<uint32_t, 16>` is widened to four 4-lane `SimdMask<uint64_t, 4>` chunks to
-provide the sign-bit masks that `vpgatherqq` expects.
-
-**Finding 4 — `-march=native` gains nothing for GCC, in either backend.**
-GCC's stdx backend produces identical output (641/579 insns, 0 gathers) for both
-`-mavx2` and `-march=native`.
-
The array backend with `-march=native` (1175 insns, -0 gathers) also emits zero hardware gathers — even for the bare unmasked -`for (i) dst[i] = ptr[idx[i]]` loop that Clang converts to `vpgatherqq`. GCC's -auto-vectorizer cost model treats gather instructions as unprofitable regardless of -tuning, preferring 40 `vpextrq` + 16 `vpinsrq` + 65 `vmovq` (scalar lane-by-lane) -instead. This is a GCC backend policy, not a flag or mask-type issue. - -**Finding 5 — Masking was the auto-vectorizer blocker for gathers.** -`gather_if` takes an `if (mask[i]) dst[i] = ptr[idx[i]]` shape — a conditional store. -This defeats Clang's gather auto-vectorizer for every mask element type tried (bool, -uint32_t, uint64_t). The unmasked `gather` loop `dst[i] = ptr[idx[i]]` is the one -pattern that Clang + native tuning converts to `vpgatherqq`. The sentinel invariant -makes the change safe: Step 2 uses `d ∈ [0,26]` (SWAR always valid); Steps 4a/4b use -`tapLeafOffset_i64 = 0` for invalid lanes (reading from base[0], the center leaf — safe -but unused); Step 5 is kept masked so that `maskWords = 0` for invalid lanes, ensuring -`isActive = false` without a cross-width mask AND. - -**`popcount`** (out-of-line in all variants that reach it): 88 instructions, 85 ymm. -Fully vectorised with `vpsrlq`, `vpand`, `vpsubq`, `vpaddq`. Adding -`[[gnu::always_inline]]` to `util::popcount` in Simd.h eliminates the last remaining -out-of-line call in the Clang path and reduces GCC from 14 to 13 external calls. - -**Action — `[[gnu::always_inline]]` on Simd.h helpers:** -Adding `[[gnu::always_inline]]` (or `__attribute__((always_inline))`) to `gather_if`, -`simd_cast`, `simd_cast_if`, `where`, and `popcount` in Simd.h eliminates all 13 -`vzeroupper` transitions under GCC. Clang already inlines all but `popcount`; the -attribute is safe and a no-op for Clang. - -**`popcount` alternative — `vpshufb`-based nibble popcount:** -The current SWAR shift-and-add tree (88 instructions, §8e) avoids the scalar `popcnt` -instruction because it is not vectorisable into `VPOPCNTQ` on AVX2. There are two -other options worth considering: - -*Scalar `popcnt` with extract/reassemble:* `popcnt` is pipelined (Skylake+: 3-cycle -latency, 1/cycle throughput on port 1; 16 independent lanes retire in ~16 cycles). -The catch is the vector↔scalar domain crossing: extracting 16 uint64_t from 4 ymm -registers requires ~20 `vpextrq`/`vextracti128` instructions, and reassembly costs -another ~20 `vmovq`/`vpinsrq`/`vinserti128`. Total ≈ 56 instructions — fewer than -SWAR, but the bypass latency penalty (~2 cycles per ymm→GPR crossing on Skylake) -reduces the advantage, and port 1 serialises all 16 `popcnt`s. - -*`vpshufb`-based nibble popcount (recommended):* Stays entirely in vector registers, -no domain crossing, and shrinks the body to ≈ 40 instructions: - -``` -lo = v & 0x0F0F0F0F0F0F0F0F (vpand) -hi = (v >> 4) & 0x0F0F0F0F0F0F0F0F (vpsrlq + vpand) -bpop = vpshufb(lut, lo) + vpshufb(lut, hi) (2× vpshufb + vpaddq) -sum = vpsadbw(bpop, zero) (horizontal byte-sum → 64-bit lane result) -``` - -`vpshufb` and `vpsadbw` use ports 0/5 and port 5 respectively — orthogonal to the -arithmetic-heavy SWAR ports — so the `vpshufb` path is also more friendly to -out-of-order overlap with surrounding code. This is the standard compiler-generated -AVX2 popcount pattern and the likely replacement for `popcount64` in `Simd.h`. - -### 8g. 
Cycle budget and architectural comparison
-
-> **Revision note (see §8j).** The cycle-budget table below models the
-> *historical* fully-SIMD `cachedGetValue` (§8e) and predicts a ~55-cycle
-> critical path dominated by the gather chain. That pipeline no longer
-> ships (hybrid refactor, §8i), and even for the scalar-tail path PMU
-> measurement shows that the dominant cost is **not** gather/pointer-chase
-> latency but rather **`valueMask.isOn(offset)` branch mispredicts**
-> (§8j). The "4–10× CPU speedup over scalar" framing below remains
-> directionally correct (the hybrid does still beat Legacy), but the
-> magnitude is ~1.05× on 32-thread WENO5, not 4×. Use the §8j matrix as
-> the authoritative measurement; treat this section as design-rationale
-> history.
-
-#### `cachedGetValue` critical path (Clang 18 + stdx + `-march=native`, W=16)
-
-| Step | Work | Cumulative cycles |
-|------|------|------------------:|
-| 1 | SWAR expansion + base-32 multiply → `d_vec` | ~8 |
-| 2 | 4× `vpgatherdd` → `tapLeafID_u32` | ~20 |
-| 3 | `simd_cast_if` + ×kStride → `tapLeafOffset_i64` | ~25 |
-| 4a/4b/5 | 4+4+4 `vpgatherqq` (3 independent groups, overlap in OoO) | ~41 |
-| 6–8 | bitwise `dest_yz`, `maskWords & voxelBit`, popcount SWAR + `where` | **~55** |
-
-Critical path per call: **~55 cycles** (gather-chain limited; Steps 4a/4b/5 are the
-deepest dependency).
-
-Single-core throughput reality: each call is ~600 instructions. Arrow Lake's ROB
-(~500 entries) holds less than one full call, so call-to-call OoO overlap is minimal.
-Realistic single-core cost is **~80–100 cy/call**, not the ~7 cy/call that perfect 8×
-OoO would imply. For 128 elements ÷ 16 lanes × 18 taps = 144 calls:
-**~12,000–14,000 cycles single-threaded**, or **~100 cy/element**.
-
-#### Comparison with scalar NanoVDB `getValue(ijk)`
-
-Naive alternative: 128 voxels × 18 taps = 2304 scalar `ReadAccessor::getValue()` calls.
-
-| Accessor L0 cache behaviour | cy/call | 2304 calls | cy/element |
-|-----------------------------|--------:|-----------:|-----------:|
-| Hit (same leaf as last call) | ~22 | ~51,000 | ~400 |
-| Miss, tree nodes L1-warm | ~52 | ~120,000 | ~940 |
-| Miss, tree nodes cold | ~100+ | ~230,000 | ~1800 |
-
-**BatchAccessor speedup: 4–10× depending on hit rate.**
-
-The two sources of gain:
-
-1. **Amortised tree traversal (dominant).** `prefetch` calls `probeLeaf` at most once
-   per direction per center-leaf switch — **12 calls** for a 128-element block (6
-   directions × 2 center-leaf switches) vs. up to 2304 traversals for the scalar path.
-   Each saved traversal is ~25–35 cycles of pointer-chasing through root → internal →
-   internal → leaf with warm L1 nodes.
-
-2. **SIMD × 16.** The SWAR expansion, gather chain, and popcount all execute once for
-   16 lanes simultaneously. Even if the scalar accessor hit perfectly on every call,
-   the SIMD path still wins by ~4× on arithmetic work alone.
-
-The scalar hit rate depends on loop ordering. Processing all 18 taps for one voxel
-before moving to the next evicts the cached leaf on nearly every tap switch (high miss
-rate). Sweeping all 128 voxels for one tap at a time improves hit rate, but requires
-18 passes over the voxel array and hurts reuse of stencil results. Both orderings are
-sketched below.
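-
-```cpp
-#include <cstdint>
-#include <nanovdb/NanoVDB.h>
-
-// Illustrative names only; AccT is any scalar accessor with a cached-leaf
-// getValue, coord[] holds the 128 voxel centers, taps[] the 18 offsets.
-
-// (a) voxel-outer, tap-inner: the cached leaf is evicted on nearly every
-// tap switch, so the L0 hit rate is poor.
-template <typename AccT>
-void gatherVoxelOuter(AccT& acc, const nanovdb::Coord* coord,
-                      const nanovdb::Coord* taps, uint64_t (*out)[128])
-{
-    for (int v = 0; v < 128; ++v)
-        for (int t = 0; t < 18; ++t)
-            out[t][v] = acc.getValue(coord[v] + taps[t]);
-}
-
-// (b) tap-outer, voxel-inner: much better hit rate, but 18 passes over the
-// block and no reuse of per-voxel stencil state.
-template <typename AccT>
-void gatherTapOuter(AccT& acc, const nanovdb::Coord* coord,
-                    const nanovdb::Coord* taps, uint64_t (*out)[128])
-{
-    for (int t = 0; t < 18; ++t)
-        for (int v = 0; v < 128; ++v)
-            out[t][v] = acc.getValue(coord[v] + taps[t]);
-}
-```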
- -#### CPU vs GPU: why the same operation inverts - -On CPU (8 P-cores), the 128-element block is **compute-bound**: - -- Index computation: ~12,000 cy per core -- Value fetch (512 unique floats, 32 cache lines, 8 cores competing for DDR5-5600): - ~80–664 cycles depending on cache level and core count -- System DRAM bandwidth consumed at full parallelism: ~4.6 GB/s out of 89 GB/s - available (~5% utilisation) - -The gather chain latency is the bottleneck; bandwidth sits largely idle. The CPU -BatchAccessor design (SIMD W=16, hardware `vpgatherqq`) directly attacks this by -compressing 16 serial gather chains into one parallel 55-cycle critical path. - -On GPU the same operation becomes **bandwidth-bound**: - -- An SM has hundreds of warps in flight. When a warp stalls on a gather or arithmetic - latency (~20–100+ cycles), the scheduler switches to another ready warp instantly. - The entire index computation — SWAR, base-32 multiply, all gather latencies — is - absorbed by warp switching. Effective compute cost per thread: ~0 stall cycles. -- What remains visible to the GPU is the **global memory traffic**: fetching stencil - float values. With hundreds of SMs each issuing many transactions simultaneously, - HBM bandwidth saturates quickly. -- GPU gathers are scalar-per-thread: 32 threads in a warp each doing an 8-byte load = - 32 independent transactions. Non-contiguous addresses (stencil neighbours across - leaves) yield uncoalesced access, amplifying bandwidth pressure. - -Consequently, GPU optimisation for this workload targets **coalescing** (adjacent -threads access adjacent values) and **cache footprint** (keeping the neighbour-leaf -working set in L1/shared memory), rather than the gather-chain depth that dominates -on CPU. - -### 8h. End-to-end perf: outlining, `[[gnu::flatten]]`, and W=8 - -> **Revision note (see §8j).** The end-to-end measurements and `[[gnu::flatten]]` -> findings in this section are correct. The *attribution* of cross-leaf cost to -> "multi-leaf L1 pressure" — which appeared here and in the original analysis — -> was **wrong**. `perf` counter measurements later showed that L1 miss rates are -> flat across all variants (~0.4 %) and that the dominant cross-leaf cost is -> actually **branch-mispredict stalls on the `valueMask.isOn(offset)` check** -> inside `LeafNode::getValue(offset)`. See §8j for the full -> perf-counter investigation and revised decomposition. - -§8f measured `cachedGetValue` as a standalone symbol. This section measures the -**full WENO5 pipeline end-to-end** — `StencilAccessor::moveTo` driving 18 taps × -128 voxels/block × 131072 blocks across 32 TBB threads — and reveals a much -larger GCC pathology that a single-function measurement cannot see. - -Workload: `ex_stencil_gather_cpu 33554432 0.5` (16 M active voxels, 50% occupancy, -i9-285K Arrow Lake, 32 threads, `-O3 -march=native`). Time is wall clock via -`nanovdb::util::Timer`; checksum-matches `LegacyStencilAccessor` in every run. - -#### End-to-end latency (ns/voxel, smaller is better) - -| Variant | GCC 13 | Clang 18 | -|--------------------------------------|-------:|---------:| -| No `flatten` | 7.5 | 4.3 | -| `flatten` on `BatchAccessor::{prefetch,cachedGetValue}` | 4.9 | 4.3 | -| `flatten` on `StencilAccessor::moveTo` (full transitive) | **3.7** | 4.3 | -| `LegacyStencilAccessor` reference | 5.4 | 6.7 | - -Without `flatten`, GCC's SIMD `StencilAccessor` is **39% slower than the scalar -`ReadAccessor`-based `LegacyStencilAccessor`** — the SIMD abstraction turns into a -net loss. 
With `[[gnu::flatten]]` on `moveTo`, GCC becomes 33% faster than scalar -and edges out Clang. - -#### Per-batch call accounting (GCC, W=16) - -`moveTo` processes 16 voxels per batch. Per-batch call count is the product of: - -| Call site | No flatten | moveTo flatten | -|-------------------------------------|-----------:|---------------:| -| `moveTo` → `prefetchHull`, `calcTaps` | 3 | inlined | -| `prefetchHull` internals | 12 | inlined | -| `calcTaps` → 18× `cachedGetValue` + 18× `WhereExpression::op=` | 37 | inlined | -| Inside each `cachedGetValue`: 14 outlined Simd.h helpers × 18 | 252 | inlined | -| Stack-canary / misc | 19 | 0 | -| **Total calls per batch** | **~323** | **0** | -| **Total `vzeroupper` per batch** | **~282** | **1** (epilogue) | - -At 16 voxels/batch, that is **~18 `vzeroupper` per voxel** without flatten. Each -VZU is cheap (~1–2 cycles) but serves as a strong ABI barrier that defeats the -out-of-order engine's ability to overlap pre- and post-call work. Combined with -the per-call argument marshaling of `_Fixed<16>` aggregates (128 B by reference), -the accumulated cost is the full 3.2 ns/voxel gap between the two variants. - -#### Why outlining happens under GCC - -Each Simd.h helper (`gather`, `gather_if`, `simd_cast`, `simd_cast_if`, `where`, -`popcount`, `WhereExpression::op=`) is an `inline` template. With `-O3`, GCC's -inliner decides each is "too expensive to inline" once the caller -(`cachedGetValue`, ~900 B) reaches a growth-budget threshold. It emits each -helper as a weak COMDAT and calls it. Every such call takes `_Fixed<16>` -aggregates by reference (the parameter doesn't fit in YMM), triggering -`vzeroupper` on entry. - -The same pattern propagates up: `calcTaps` (after inlining) is too big to accept -18 copies of `cachedGetValue`, so GCC outlines those too — one weak symbol per -template instantiation. Then `StencilAccessor::moveTo` calls `calcTaps` and -`prefetchHull` across that same boundary. - -Clang's inliner makes different decisions — it inlines the Simd.h helpers into -each `cachedGetValue`, keeps `cachedGetValue` outlined per-tap, and accepts the -18 calls from `calcTaps`. Clang also emits hardware gathers under `-march=native` -(16 `vpgather` per tap, see §8f), amortising the per-call cost with faster -gather semantics. - -#### Why `[[gnu::flatten]]` on `moveTo` wins - -`__attribute__((flatten))` forces **every call** in the annotated function's body -to be inlined, recursively — overriding all cost heuristics. Applied to -`StencilAccessor::moveTo`, it collapses the entire call tree (`prefetchHull`, -`calcTaps`, 18× `cachedGetValue`, 14× helpers per tap) into one monolithic -inlined body. Observed: **0 calls, 1 `vzeroupper` (function epilogue only), -14 350 insns, 77 KB of text in a single symbol**. - -Trade-offs: - -- Binary size: one 77 KB function per `StencilAccessor` instantiation. L1i is - 32 KB, but the per-batch hot path only sweeps a small fraction of the body - linearly, so I-cache pressure is manageable. -- Debuggability: one giant symbol to step through vs 40+ small symbols. -- Compile time: GCC spends notably longer compiling a flattened `moveTo`. - -#### Why `flatten` on `BatchAccessor::prefetch`/`cachedGetValue` alone is insufficient - -Flattening at the BatchAccessor level inlines the 14 Simd.h helpers into each -`cachedGetValue`/`prefetch` body (so each of those becomes a clean, self-contained -~800-insn function with ≤2 residual calls — typically `WhereExpression::op=` and -the `_S_generator` stdx lambda for `popcount`). 
However it leaves the 18
-`cachedGetValue` call sites *themselves* outlined — `calcTaps` still pays 38
-calls and 26 `vzeroupper` per batch. Measured: 4.9 ns/voxel — halfway between
-no-flatten and full-flatten.
-
-The signal is clear: the *outer* `moveTo` → `calcTaps` → per-tap call boundary
-is the dominant cost, not the inner helper-call boundary.
-
-#### W=8 experiment (batch-width halving)
-
-Motivation: halving the batch width reduces register pressure and spill volume,
-and shifts some types from `_Fixed` to `_VecBuiltin<32>` (the native
-`__m256i` ABI). Specifically at W=8:
-
-- `Simd<uint16_t, 8>` — 16 B, `_VecBuiltin<16>` (native XMM)
-- `Simd<uint32_t, 8>` — 32 B, `_VecBuiltin<32>` (native YMM) ✓ register-passable
-- `Simd<uint64_t, 8>` — 64 B, still `_Fixed<8>` (2× YMM aggregate, not passable)
-- `Simd<int64_t, 8>` — same as uint64
-
-Only the `uint32_t` leaf-ID/mask vectors become register-passable; the dominant
-`uint64_t` index vectors are still aggregate (half the size of the W=16
-aggregate, but still stack-passed).
-
-Measured at W=8 with full flatten:
-
-| Metric | W=16 | W=8 | Δ |
-|-------------------------|--------:|--------:|--------:|
-| `moveTo` text size | 77 KB | 34 KB | −56% |
-| `moveTo` insns | 14,349 | 7,182 | −50% |
-| YMM spill stores | 469 | 67 | **−86%**|
-| YMM spill loads | 351 | 167 | −52% |
-| vpinsrq (software-gather glue) | 432 | 216 | −50% |
-| `vpgather*` | 0 | 0 | unchanged |
-| `vzeroupper` | 1 | 1 | unchanged |
-| **End-to-end (GCC)** | **3.7 ns/vox** | 4.2 ns/vox | +0.5 |
-| **End-to-end (Clang)** | 4.3 ns/vox | 4.0 ns/vox | −0.3 |
-
-W=8 dramatically reduces register pressure (the spill count is 86% lower). But
-GCC's end-to-end time regresses by 0.5 ns/voxel because the per-batch framing
-cost (`zeroIndices`, `leafSlice == centerLeafID` mask compute, straddling
-loop control, `prefetchHull`) is now amortised across only 8 lanes instead of
-16. The body of `moveTo` halved; the surrounding scaffolding doubled.
-
-Clang benefits slightly (−0.3 ns/voxel), likely because its outlined
-`cachedGetValue` was paying more call-frame marshaling at W=16 (4× YMM aggregate
-vs 2× YMM at W=8).
-
-**Takeaway for future design**: W=8 would become attractive if the per-batch
-framing work can be amortised across multiple adjacent batches — for example,
-hoisting `prefetchHull` outside the batch loop for cases where the hull mask
-is invariant across several batches of the same center-leaf.
-
-#### Findings
-
-**F1 — GCC's default codegen for this abstraction is broken.** Without
-`flatten` or equivalent attributes, GCC emits ~323 calls / ~282 `vzeroupper`
-per 16-voxel batch, making the SIMD `StencilAccessor` *slower* than the scalar
-`LegacyStencilAccessor`.
-
-**F2 — `[[gnu::flatten]]` on `StencilAccessor::moveTo` restores performance.**
-One attribute, targeting the WENO5 pipeline entry point, drops GCC from 7.5 to
-3.7 ns/voxel (2×) and makes GCC the fastest of the measured configurations.
-
-**F3 — Partial flattening at `BatchAccessor::{prefetch,cachedGetValue}` is not
-enough.** The inner helper calls are eliminated but the 18 `cachedGetValue`
-call sites themselves remain — 4.9 ns/voxel.
-
-**F4 — Hardware gathers are not needed on Arrow Lake.** GCC emits 0 `vpgather`
-in all variants; Clang+native emits 16 per `cachedGetValue`. GCC's
-software-gather path (scalar loads + `vpinsrq`) nevertheless beats Clang's
-hardware-gather path end-to-end (3.7 vs 4.3 ns/voxel) because the three load
-ports issue the scalar gathers in parallel and the out-of-order engine hides
-the latency. §8f Finding 5 (unmasked-gather auto-vectorisation) remains
-correct; it is simply not load-bearing on this microarchitecture.
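-
-The two loop shapes in question, for reference (sketch; names are
-illustrative, the pattern is the §8f one):
-
-```cpp
-#include <cstdint>
-
-// Unmasked gather: Clang + native tuning turns this loop into vpgatherqq.
-void gatherU64(uint64_t* __restrict__ dst, const uint64_t* __restrict__ base,
-               const int64_t* __restrict__ idx, int n)
-{
-    for (int i = 0; i < n; ++i)
-        dst[i] = base[idx[i]];
-}
-
-// Masked gather_if: the conditional store defeats auto-vectorization, so
-// this stays a scalar lane-by-lane loop on every compiler tested.
-void gatherIfU64(uint64_t* __restrict__ dst, const uint64_t* __restrict__ base,
-                 const int64_t* __restrict__ idx, const bool* __restrict__ mask,
-                 int n)
-{
-    for (int i = 0; i < n; ++i)
-        if (mask[i])
-            dst[i] = base[idx[i]];
-}
-```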
-
-**F5 — W=8 reduces spills dramatically but does not help end-to-end on GCC.**
-Per-batch framing cost dominates at smaller widths.
-
-**F6 — Clang's performance is relatively insensitive to these knobs.**
-Clang inlines the Simd.h helpers regardless of `flatten`, and its outlined
-`cachedGetValue` pays only moderate call overhead. It lands in the same
-4.0–4.3 ns/voxel band across all variants tested.
-
-**Not applied.** The codebase does not ship `[[gnu::flatten]]` by default.
-StencilAccessor-style callers that require peak GCC performance may apply it
-to their own hot entry point; the attribute is safe and a no-op under Clang.
-This choice keeps the library's default codegen predictable and avoids forcing
-a 77 KB monolithic body on callers with smaller working sets.
-
-### 8i. Hybrid SIMD → scalar-tail design *(current)*
-
-> **Revision note (see §8j).** The hybrid design and the perf-matrix numbers
-> in this section are correct. Two *claims* in the "Cost of the refactor" /
-> "Cleanup" subsections were subsequently refined:
->
-> 1. The cross-leaf overhead (`Stencil − InLeaf ≈ 0.9 ns/voxel`) was attributed
->    here to "multi-leaf L1 pressure". `perf` showed L1 miss rates are flat;
->    the real source is additional unpredictable branches in the cross-leaf
->    path.
-> 2. The architectural claim that the 27-leaf neighbor cache eliminates full
->    tree walks (§8i "Not applied" discussion) is correct structurally, but the
->    *magnitude* of that savings is much smaller than implied. Measured via
->    controlled decomposition: ~**0.3 ns/voxel** — about 6 % of Legacy's total
->    5.4 ns/voxel — not the majority of the "4.4 ns/voxel cross-leaf cost" this
->    section's table implies. See §8j for the quantified breakdown.
->
-> The hybrid design itself remains the right shipped choice; the refactor's
-> primary win is a Simd-free public API and compiler-portable performance,
-> not the cache lookup.
-
-The findings of §8f/§8h motivated a different trade-off, which is what the
-codebase now ships.
-
-**Where SIMD genuinely helps** (kept as SIMD):
-- `prefetch()` — SWAR direction extraction over
-  `Simd<uint16_t, 16>` (32 B = one native `__m256i`), horizontal carry-bit
-  reductions, mask-bit identification of unique neighbor directions.
-  Amortizes the `probeLeaf` call over all 16 lanes and over every tap that
-  reaches the same direction.
-- The *setup* half of `cachedGetValue`: SWAR expansion, `packed_sum`, base-32
-  direction extraction (`d_u16`), and local-offset extraction
-  (`localOffset_u16`) from the packed layout. All of this is pure uint16
-  SIMD arithmetic on a single `__m256i` — no aggregate ABI, no gathers, no
-  heterogeneous where-blends, no Simd.h helpers that GCC outlines.
-
-**Where SIMD was dragging us down** (now scalar):
-- The gather chain (Steps 2–8 of §8e): 14 Simd.h helper calls per
-  `cachedGetValue` instantiation, operating on `_Fixed<16>` aggregates. This
-  is what produces 282 `vzeroupper` per batch on GCC without `flatten` (§8h).
-- Scalar equivalents of the arithmetic (single `popcnt`, couple of scalar
-  loads from the target leaf, one `uint64_t` add) measure at **0.05 ns/tap**
-  when 18 taps × 16 lanes overlap freely on the load ports (§8h Legacy
-  decomposition — it's what `leaf.getValue(offset)` does internally anyway).
-
-**The boundary**: right after `d_u16` / `localOffset_u16` are computed.
-Two `util::store` calls harvest them into stack `uint16_t[W]` C arrays; a
-`util::to_bitmask` harvests the SIMD `leafMask` into a `uint32_t` bitmask.
-The scalar tail is a one-liner per lane:
-
-```cpp
-const uint32_t leafID = mNeighborLeafIDs[neighborIdx[lane]];
-if (leafID == kNullLeafID) { dst[lane] = 0; continue; }
-dst[lane] = mFirstLeaf[leafID].getValue(localOffset[lane]);
-```
-
-**API change**: `cachedGetValue`'s output parameter is now
-`ScalarValueT (&dst)[LaneWidth]` — a plain C array, one entry per lane —
-instead of the old `Simd<uint64_t, LaneWidth>&` aggregate. Scalar lane writes
-are a single `mov` with no mask round-trip, which is what eliminates the
-18× `WhereExpression::operator=` outlined symbol.
-
-**StencilAccessor changes** (StencilAccessor.md §8.1):
-- Storage: `Simd<uint64_t, W> mIndices[SIZE]` → `uint64_t mIndices[SIZE][W]`,
-  made **public** (there's no work hidden behind the access).
-- Return type of `moveTo()`: `SimdMask<>` → `void` (active-lane
-  information is `leafIndex[i] != UnusedLeafIndex`, already available to
-  the caller).
-- Removed `getValue()` and `operator[]`; added
-  `static constexpr tapIndex<di, dj, dk>()` for reorder-safe compile-time
-  named-tap access.
-
-**Public API of `StencilAccessor`**: zero `Simd<>` or `SimdMask<>` types.
-Callers may SIMD-load tap rows from `mIndices[k]` with their own preferred
-backend (`Simd<uint64_t, W>::load(mIndices[k], element_aligned)`) or iterate
-scalarly — we don't impose a choice.
-
-#### Perf comparison (same workload as §8h: 32 M ambient / 50% / 32 threads)
-
-| Variant                           | GCC 13 ns/vox | Clang 18 ns/vox |
-|-----------------------------------|--------------:|----------------:|
-| Old SIMD path, no flatten         | 7.5           | 4.3             |
-| Old SIMD path, +flatten on moveTo | 3.7           | 4.3             |
-| **Hybrid (current), no flatten**  | **5.1**       | **4.9**         |
-| Hybrid +flatten on moveTo         | 4.8           | 4.8             |
-| `LegacyStencilAccessor`           | 5.5           | 6.7             |
-
-Without `flatten`, the hybrid is **31% faster than the old SIMD path on GCC**
-(7.5 → 5.1) and beats scalar Legacy on both compilers. Compiler-sensitivity
-collapses: GCC and Clang deliver within 0.2 ns/voxel of each other,
-eliminating the 3× spread that §8f / §8h documented.
-
-The 4.8 ns/voxel asymptote with `flatten` on both compilers is consistent
-with the scalar `popcnt` throughput bound (288 `popcnt`/batch ÷ 1 port ÷
-5 GHz = 57 ns/batch ÷ 16 voxels = 3.6 ns/voxel just for `popcnt`, plus
-~1.2 ns/voxel of surrounding work).
-
-#### Cost of the refactor
-
-- GCC loses 1.4 ns/voxel vs the best previous configuration (SIMD +
-  `flatten(moveTo)` at 3.7 ns/vox). The SIMD popcount SWAR tree did real
-  work that scalar `popcnt` can't fully replace on port-1 throughput.
-- Clang loses ~0.6 ns/voxel vs its previous 4.3 ns/vox.
-- Part of both losses is recoverable by re-enabling `flatten` at the caller's
-  `moveTo` site (4.8 ns/vox on both compilers) — the shipped code just
-  doesn't require it by default.
-
-#### Cleanup of `Simd.h`
-
-With the gather chain gone, several helpers are no longer exercised by
-`BatchAccessor`:
-- `util::gather` / `util::gather_if`
-- `util::simd_cast` for widening `u16 → i32`, `i32 → i64`, `u16 → u64`
-- `util::simd_cast_if`
-- `util::popcount` (vector SWAR) — replaced by scalar `leaf.getValue`'s
-  internal `popcnt`
-- `util::WhereExpression` (heterogeneous form)
-
-These can be removed from `Simd.h` in a follow-up, subject to no external
-caller using them. Added to support the hybrid: `util::store(v, p)` (a
-uniform `store` shim that dispatches to `copy_to` on stdx and `store` on
-the array backend).
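-
-A plausible shape for that shim (sketch only; the backend dispatch and exact
-signatures are assumptions, not the shipped `Simd.h` code):
-
-```cpp
-#if __has_include(<experimental/simd>)
-#include <experimental/simd>
-namespace stdx = std::experimental;
-
-// stdx backend: forward to the standard copy_to.
-template <typename T, typename Abi>
-inline void store(const stdx::simd<T, Abi>& v, T* p)
-{
-    v.copy_to(p, stdx::element_aligned);
-}
-#else
-// Array backend: forward to the vector type's own store().
-template <typename VecT, typename T>
-inline void store(const VecT& v, T* p)
-{
-    v.store(p);
-}
-#endif
-```
-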
`perf`-counter investigation — what actually bottlenecks the CPU path
-
-This section records the results of a direct PMU-counter investigation that
-replaced several rounds of structural reasoning and cycle-budget estimation
-(§8e–§8i) with measurements. **It revises or refutes several earlier claims**
-and identifies the single biggest lever for CPU-side speedup of any
-`ValueOnIndex` stencil gather.
-
-#### 8j.1 Motivation
-
-By §8i we had three working hypotheses for where the ~5.4 ns/voxel of Legacy
-(and ~5.1 ns/voxel of the hybrid) was spent:
-
-1. **Tree-walk pointer chases** on leaf-cache misses (~25 % of taps cross
-   leaves in WENO5).
-2. **L1 pressure** from touching up to 6 neighbour leaves' `mValueMask` /
-   `mPrefixSum` data per voxel.
-3. **Gather-chain latency** in the old SIMD pipeline (largely mitigated by
-   the hybrid refactor — §8i).
-
-All three were structural guesses, anchored by the cycle-budget table in §8g
-and by assembly reading. None had been validated with hardware counters.
-
-#### 8j.2 Methodology
-
-Added two CLI knobs to `ex_stencil_gather_cpu`:
-
-- `--pass=<variant>` — runs exactly one of the timed variants
-  (`framing`, `decode`, `center-hit`, `legacy`, `legacy-branchless`,
-  `degenerate`, `inleaf`, `stencil`). Needed because the default harness
-  runs every variant back-to-back, and `perf stat` cannot attribute counters
-  to a subrange.
-- `--threads=<N>` — gates TBB parallelism via `tbb::global_control` (a
-  minimal sketch of this gating follows the matrix in §8j.3). Needed
-  because `perf` event multiplexing and hybrid-CPU attribution are cleaner
-  single-threaded on a single P-core.
-
-Setup: i9-285K Arrow Lake (8 P-cores + 16 E-cores, no HT). Pin to
-`taskset -c 0` for the P-core. Lower `kernel.perf_event_paranoid` to 1.
-Baseline events: `cycles, instructions, branch-instructions, branch-misses,
-L1-dcache-loads, L1-dcache-load-misses`. Workload: 32 M ambient voxels /
-50 % occupancy (16.7 M active). Build: GCC 13.3 at `-O3 -march=native`
-with `NANOVDB_USE_INTRINSICS=ON` (though see §8j.7 for why this flag is a
-no-op on this toolchain).
-
-#### 8j.3 Measurement matrix (single P-core, `--threads=1`)
-
-| Variant | ns/voxel | IPC | branch-miss | L1 miss | branch-misses / voxel |
-|---------|---------:|----:|------------:|--------:|----------------------:|
-| framing (no accessor call) | 3.2 | 2.52 | 3.15 % | 1.41 % | 2.05 |
-| center-hit × 18 (legacy, same leaf, 18 distinct coords) | 19.0 | **4.80** | **0.84 %** | 0.47 % | 2.38 |
-| Degenerate (hybrid, 18 × (0,0,0) — compiler CSE'd) | 29.0 | **4.02** | **0.75 %** | 0.41 % | 2.22 |
-| InLeaf (hybrid, 18 distinct same-leaf, no CSE) | 76.6 | **1.45** | **9.87 %** | 0.68 % | 23.1 |
-| Stencil (hybrid, WENO5 cross-leaf) | 96.9 | **1.53** | **8.75 %** | 0.46 % | 24.1 |
-| Legacy (WENO5, 1-slot path cache) | 99.2 | **1.98** | **8.85 %** | 0.40 % | 26.7 |
-
-Three immediate observations from this matrix:
-
-1. **L1-dcache-load-misses is flat** across all six variants (0.40 – 0.68 %,
-   absolute counts 25.8 – 28.3 M). The multi-leaf L1 pressure hypothesis is
-   **falsified**. Even WENO5's 6-leaf working set stays L1-resident.
-2. **Branch-miss rate splits cleanly into two groups**: "good" (0.75 – 0.84 %)
-   and "bad" (8.75 – 9.87 %). The split is not along tree-walk lines —
-   InLeaf has **no** tree walks (it is same-leaf by construction) yet lands
-   in the "bad" group with the highest miss rate of all.
-3. **IPC collapses from ~4.5 to ~1.5** between the two groups. A backend
-   throughput difference of 3× is far too large to be attributable to any
-   single cache effect. 
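-
-For reference, a minimal sketch of the `--threads=<N>` gating from §8j.2.
-The harness shape (`runPassWithThreadCap`, the block count) is hypothetical;
-only the `tbb::global_control` and `tbb::parallel_for` calls are real TBB API:
-
-```cpp
-#include <tbb/global_control.h>
-#include <tbb/parallel_for.h>
-
-// Hypothetical harness: cap TBB at nThreads for the duration of one timed
-// pass, so `perf stat` attribution stays clean (see §8j.2).
-void runPassWithThreadCap(int nThreads)
-{
-    tbb::global_control gc(tbb::global_control::max_allowed_parallelism,
-                           nThreads); // scoped: the cap lifts when gc dies
-    tbb::parallel_for(0, 131072, [](int block) {
-        // ... run one timed variant over VBM block `block` ...
-    });
-}
-```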
-
-#### 8j.4 Identifying the real source — the `valueMask.isOn(offset)` branch
-
-Every path that ends at `LeafNode::getValue(offset)` evaluates:
-
-```cpp
-uint32_t n = i >> 6;
-uint64_t w = mValueMask.words()[n], mask = 1ull << (i & 63u);
-if (!(w & mask)) return 0; // ← unpredictable branch
-uint64_t sum = mOffset + util::countOn(w & (mask - 1u));
-if (n--) sum += mPrefixSum >> (9u * n) & 511u;
-return sum;
-```
-
-For our 50 %-occupancy workload, tap positions land on ON vs OFF bits with
-roughly 60/40 frequency (spatially correlated but not perfectly). **This
-branch is fundamentally unpredictable.** Its cost compounds: at ~18 taps per
-voxel (~288 per 16-voxel batch), ~25 mispredicts per voxel × a ~15-cycle
-mispredict penalty ≈ 375 stall cycles per voxel, roughly 75 ns at 5 GHz on
-one core. That is the dominant stall in both the hybrid and Legacy paths,
-and it matches the ~70 ns/voxel single-core delta measured in §8j.5.
-
-Why do Degenerate and center-hit escape it?
-
-- **Degenerate**: 18 identical compile-time taps produce 18 identical values
-  per lane. GCC CSEs the entire per-lane computation (including the `isOn`
-  check) down to 1 evaluation + 18 stores of the same value. One branch per
-  lane survives instead of 18.
-- **center-hit (legacy)**: after the tight loop is fully inlined, GCC emits
-  the `isOn`-guarded return as a **branchless `cmov`** pattern. Verified by
-  disassembly: no conditional jump in the hot path. This is not a general
-  property — it happens because `acc.getValue(coord)` in its minimal form
-  exposes a clean `?:`-equivalent to the compiler. In the hybrid's scalar
-  tail (larger function body, per-lane loop, harvest-buffer loads), GCC
-  keeps the `isOn` as a conditional jump.
-
-#### 8j.5 Branchless experiment — quantifying the `isOn` cost
-
-Added a `legacy-branchless` variant that replaces the `leaf.getValue(offset)`
-call with the unconditional formula inlined at the call site:
-
-```cpp
-// in place of `leaf.getValue(offset)` with isOn check:
-const uint32_t offset = (c[0]&7)<<6 | (c[1]&7)<<3 | c[2]&7;
-const uint32_t wordIdx = offset >> 6;
-const uint64_t bit = 1ull << (offset & 63);
-const uint64_t word = leaf->valueMask().words()[wordIdx];
-const uint64_t prefix = (wordIdx > 0)
-    ? (leaf->data()->mPrefixSum >> (9 * (wordIdx - 1))) & 511
-    : 0;
-s += leaf->data()->mOffset + prefix + __builtin_popcountll(word & (bit - 1));
-// No isOn check. Produces a non-zero "wrong" value for OFF voxels —
-// so the checksum will NOT match — but wall-clock and PMU counters are clean.
-```
-
-Results:
-
-| Metric | Legacy (with `isOn`) | Legacy branchless | Δ |
-|-----------------------|---------------------:|------------------:|------:|
-| ns/voxel (32 thread) | 5.6 | **2.0** | −3.6 |
-| ns/voxel (1 P-core) | 103.7 | **33.2** | −70.5 |
-| IPC | 1.98 | **4.29** | 2.2× |
-| branch-miss rate | 8.07 % | **1.67 %** | −5× |
-| branch-misses / voxel | 27 | **4.6** | −6× |
-| L1 miss rate | 0.36 % | 0.48 % | ~0 |
-| instructions / voxel | 2646 | 2416 | −9 % |
-
-**The single change of removing the `isOn` branch recovers a 3× speedup on
-Legacy end-to-end.** It accounts for the entire IPC collapse. The tree
-walk inside `acc.probeLeaf()` is preserved in this variant, so the speedup
-is not from avoiding tree walks — it is from removing the pipeline stalls
-caused by mispredicting one branch per tap. 
-
-#### 8j.6 Revised attribution of Legacy WENO5's 5.4 ns/voxel
-
-| Component | ns/voxel | How isolated |
-|-----------------------------------------------|---------:|:-------------|
-| Framing (decodeInverseMaps, loop, anti-DCE) | 0.25 | measured standalone |
-| Leaf-local `getValue` work (loads + `popcnt`) | 0.75 | center-hit × 18 minus framing |
-| `valueMask.isOn` branch mispredicts (~24/voxel × ~15 cy) | **~3.6** | Legacy minus Legacy-branchless |
-| Full tree walk vs 27-leaf cache (stencil minus legacy) | **~0.3** | Stencil minus Legacy (or Legacy-branchless minus Stencil-branchless, if both existed) |
-| Multi-leaf L1 pressure | ~0 | measured: L1 miss rate flat |
-| Other (unattributed residual) | ~0.5 | total minus the rows above |
-| **Total** | **~5.4** | |
-
-The earlier framing — that "tree walks and L1 pressure dominate" — was
-wrong. Both turn out to be minor. The entire ~78 % of Legacy's cost that
-§8h attributed to "cross-leaf overhead" is actually **~80 % `isOn` mispredicts,
-~10 % real tree-walk work, ~10 % other**.
-
-#### 8j.7 `NANOVDB_USE_INTRINSICS` is a no-op on GCC 13 at `-O3 -march=native`
-
-`util::countOn(uint64_t)` in `nanovdb/util/Util.h` gates
-`__builtin_popcountll` behind `NANOVDB_USE_INTRINSICS`; the fallback is a
-SWAR popcount that uses a magic multiply (`0x0101010101010101`). Verified
-by `objdump`: the compiled binary contains 178 `popcnt` instructions and
-only 1 occurrence of the SWAR magic multiply. GCC's peephole pattern
-matcher at `-O3` recognises the SWAR shape and replaces it with hardware
-`popcnt` whether or not `NANOVDB_USE_INTRINSICS` is defined. This is
-brittle (depends on GCC version, flags, and code layout); the macro should
-be enabled explicitly in production builds for portability, but none of
-the perf numbers in this section change when it is toggled.
-
-#### 8j.8 Architectural implications
-
-1. **BatchAccessor's 27-leaf cache addresses ~6 % of the total cost.** Its
-   architectural value over the scalar `DefaultReadAccessor`'s 1-slot cache
-   is real but modest on this workload. The neighbour cache eliminates the
-   full root-to-leaf traversal on every cross-leaf tap (§8i, confirmed
-   structurally), but the wall-clock saving is ~0.3 ns/voxel — dominated by
-   OoO pipelining of otherwise-serial pointer chases.
-
-2. **The biggest cheap CPU win available is branchless
-   `LeafNode::getValue(offset)` in NanoVDB proper.** Rewriting
-   that function (perhaps ~15 lines, preserving semantics for OFF voxels via
-   a branchless arithmetic gate) would give every stencil-gather caller —
-   Legacy, hybrid, any future variant — a 2–3× speedup
-   on CPU. Proposed form, sketched below, keeps OFF-returns-0 semantics:
-
-   ```cpp
-   // sketch, not tested:
-   __hostdev__ uint64_t getValue(uint32_t i) const {
-       const uint32_t n = i >> 6;
-       const uint64_t w = mValueMask.words()[n];
-       const uint64_t bit = 1ull << (i & 63u);
-       const uint64_t mask = bit - 1u;
-       const uint64_t on = (w & bit) ? ~0ull : 0ull; // cmov via explicit ternary
-       const uint64_t pfx = n ? ((mPrefixSum >> (9u * (n - 1u))) & 511u) : 0ull;
-       return on & (mOffset + pfx + util::countOn(w & mask));
-   }
-   ```
-   (The `on` gate pattern compiles to a `test`+`cmov` on GCC; the
-   `leaf.getValue` call pays one predictable branch instead of one
-   unpredictable one. Needs benchmarking to confirm the optimiser doesn't
-   refold it into a conditional jump.)
-
-3. 
**The hybrid `StencilAccessor`'s design rationale needs a small rewrite.**
-   The shipped hybrid design (§8i) is still the right API choice (Simd-free
-   public surface, compiler-portable perf) — but the justification is not
-   "it beats the gather chain's L1 pressure" (there is none); it is "it
-   matches the compiler's natural inlining / vectorisation model for this
-   workload and eliminates the outlining/vzeroupper pathology (§8h)." The
-   gain over Legacy WENO5 is marginal (~0.3 ns/voxel) because both pay the
-   same dominant `isOn` mispredict cost; the hybrid's real value emerges
-   only if and when `leaf.getValue` is made branchless.
-
-#### 8j.9 Historical correction log
-
-| Earlier claim | Source | Revised to |
-|------------------------------------------------------------|:-----------|:-----------|
-| "Tree-walk latency is the critical path" (cycle-budget) | §8g | OoO absorbs most of it; isOn mispredicts dominate. |
-| "Multi-leaf L1 pressure accounts for ~0.9 ns/voxel cross-leaf overhead" | §8h, §8i | L1 miss rate is flat; the 0.9 ns/voxel is mostly isOn mispredicts shared with same-leaf InLeaf. |
-| "Tree walks cost ~78 % of Legacy's time (4.4 ns/voxel)" | §8h (implicit); my thread claim | Real tree-walk cost is ~0.3 ns/voxel; the 4.4 ns/voxel was mostly isOn mispredicts. |
-| "Degenerate ~1.7 ns/voxel is the hybrid's floor" | my thread claim | Degenerate is heavily CSE-biased; real floor is InLeaf at ~4.2 ns/voxel, of which ~3.5 is isOn mispredicts. |
-| "`NANOVDB_USE_INTRINSICS` matters for popcount-heavy paths" | general assumption | No-op on GCC `-O3 -march=native`: SWAR → popcnt pattern match. Enable for portability anyway. |
-| "27-leaf cache is the architectural win of BatchAccessor" | §8i "Cost of the refactor" | Cache delta is ~0.3 ns/voxel. Real wins are the Simd-free API and flatten-free compiler portability (§8i). |
-
-### 8k. Follow-up: branchless `LeafData::getValue`, narrow-band validation, and accessor cache-level
-
-Follow-on to §8j. Three things happened:
-(1) the branchless reformulation of `leaf.getValue` was moved from a
-hand-inlined benchmark hack into `NanoVDB.h` proper and made the default
-body of `LeafData::getValue`, gated by
-`NANOVDB_USE_BRANCHY_GETVALUE` for the legacy form;
-(2) a second example (`ex_narrowband_stencil_cpu`) was added to validate
-the finding on a real narrow-band level set rather than a pathological
-random-occupancy synthetic;
-(3) we noticed the scaffolding was using the default 3-level
-`ReadAccessor` when only the leaf-level cache can
-actually contribute, and switched to a leaf-only `ReadAccessor<BuildT, 0>`.
-
-#### 8k.1 The API change: branchless `getValue` by default, toggle for the old form
-
-`LeafData<ValueOnIndex>::getValue` (NanoVDB.h:~4140) is now a
-preprocessor-toggled pair: the branchless form is the default; defining
-`NANOVDB_USE_BRANCHY_GETVALUE` at compile time restores the pre-2026
-branchy implementation.
-
-```cpp
-__hostdev__ uint64_t getValue(uint32_t i) const
-{
-#ifdef NANOVDB_USE_BRANCHY_GETVALUE
-    uint32_t n = i >> 6;
-    const uint64_t w = BaseT::mValueMask.words()[n], mask = uint64_t(1) << (i & 63u);
-    if (!(w & mask)) return uint64_t(0);
-    uint64_t sum = BaseT::mOffset + util::countOn(w & (mask - 1u));
-    if (n--) sum += BaseT::mPrefixSum >> (9u * n) & 511u;
-    return sum;
-#else
-    const uint32_t n = i >> 6;
-    const uint64_t w = BaseT::mValueMask.words()[n];
-    const uint64_t bit = uint64_t(1) << (i & 63u);
-    const uint64_t prefix = n == 0u ? 
uint64_t(0)
-        : (BaseT::mPrefixSum >> (9u * (n - 1u))) & 511u;
-    const uint64_t sum = BaseT::mOffset + prefix + util::countOn(w & (bit - 1u));
-    return ((w & bit) ? ~uint64_t(0) : uint64_t(0)) & sum;
-#endif
-}
-```
-
-Key design points:
-
-- **Default is branchless**, so every caller of
-  `leaf->getValue(offset)` / `leaf->data()->getValue(offset)` /
-  `ReadAccessor::getValue(ijk)` inherits the speedup with no code
-  change. The `NANOVDB_USE_BRANCHY_GETVALUE` macro restores the old
-  behaviour for bisection, regression testing, or performance
-  comparison.
-- The `(w & bit) ? ~0ull : 0ull` ternary compiles to `test + cmov` on
-  x86 (verified on GCC 13 at `-O3 -march=native`), eliminating the
-  mispredict-prone conditional-jump pattern of the branchy form.
-- The prefix-extract ternary (`n == 0u ? 0 : ...`) is kept as-is — its
-  outcome is 7:1 biased, so the predictor handles it cleanly, and the
-  shift would be UB on `n-1` if `n==0`.
-- OFF voxels still return 0 (gated by the mask-AND), so the output is
-  bit-for-bit identical to the old `getValue`. **Checksum matches
-  byte-for-byte on all measured workloads.**
-- Scoped to `LeafData<ValueOnIndex>` — the only build type where the
-  original early-return guard introduced a data-dependent branch. Other
-  `LeafData` specializations are unchanged.
-
-**API evolution note.** During the investigation the branchless form
-was first committed as a sibling method `getValueBranchless` (8a24ddfd)
-so callers could opt in explicitly. After benchmarking confirmed the
-branchless form is strictly faster or within ~0.1 ns/vox on every
-workload measured — including cases where the branch is highly
-predictable — the sibling was folded into `getValue` as the default, and
-the macro toggle was added so the pre-2026 form stays reachable by
-explicit opt-in. Early commit messages in this branch may still
-reference `getValueBranchless`; the surviving API is the single
-toggleable `getValue`.
-
-#### 8k.2 `ex_narrowband_stencil_cpu` — realistic workload benchmark
-
-New example under `nanovdb/nanovdb/examples/ex_narrowband_stencil_cpu/`.
-Structurally a clone of `ex_stencil_gather_cpu` (same `--pass=` /
-`--threads=` CLI, same set of decomposition variants), but replaces
-the procedural random-occupancy domain with `.vdb` file loading:
-
-- `openvdb::io::File(path).readGrid(name)` → `openvdb::FloatGrid`
-- `nanovdb::tools::CreateNanoGrid(grid).getHandle<
-  ValueOnIndex, HostBuffer>(channels=0, ...)` → topology-only `NanoGrid<ValueOnIndex>`
-- `builder.copyValues(sidecar.data())` → separately-allocated
-  `std::vector<float>` sidecar (no blind-data residue in the
-  grid). Ordering sanity-checked at startup (1000 samples).
-
-The sidecar is plumbed through but not yet consumed by any stencil path
-— placeholder for future "fetch values via the sidecar" work.
-
-Test input: `taperLER.vdb`, a ~129 MB narrow-band `UnsignedDistanceField`
-FloatGrid with 31.8 M active voxels over a 1125×1081×762 bbox.
-
-#### 8k.3 Narrow-band vs synthetic measurement matrix
-
-Single P-core, `--threads=1`, PMU counters, `-O3 -march=native`:
-
-| Variant | Workload | ns/voxel | IPC | branch-miss | L1 miss |
-|--------------------|-------------|---------:|-----:|------------:|--------:|
-| branchy | narrow-band | 47.0 | 4.22 | 1.74 % | 0.06 % |
-| branchless (default) | narrow-band | **34.5** | **5.55** | **0.45 %** | 0.07 % |
-| branchy | synthetic | 106.1 | 1.96 | 8.07 % | 0.36 % |
-| branchless (default) | synthetic | **37.9** | **4.55** | **1.63 %** | 0.39 % |
-
-Two observations that refine §8j:
-
-1. 
**Narrow-band is *not* pathological for branch prediction.** At 1.74 %
-   miss rate the branch predictor handles spatially-coherent traversals
-   well enough that the original `getValue` runs at IPC ~4.2 (near peak
-   for integer code). The isOn branch is only catastrophic when access
-   patterns are genuinely unpredictable; narrow-band SDF walks aren't.
-2. **Branchless still wins on narrow-band** (47 → 34.5 ns/vox,
-   1.4×) because the branch is still data-dependent even if mostly
-   predictable — every ~1 in 60 calls costs ~15 cycles. On synthetic
-   the benefit is much larger (2.8×) because there's a genuine
-   mispredict storm to eliminate.
-
-Per-call instruction counts for the two `getValue` forms differ by only a
-handful in both cases; L1 behaviour is identical. The speedup is entirely
-recovery of branch-mispredict pipeline stalls.
-
-#### 8k.4 Accessor cache-level finding
-
-The default `ReadAccessor` (`DefaultReadAccessor`) maintains
-three cache slots (leaf, lower, upper). For `GetValue` workloads the
-upper/lower slots are **never consulted** on a leaf-cache miss —
-`ReadAccessor::get` falls straight through to `mRoot->getAndCache`
-(NanoVDB.h:5387) — they're only written as passive side-effects of the
-root-walk's `acc.insert(ijk, child)` calls at each level.
-
-Switching the scaffolding to a leaf-only `ReadAccessor<BuildT, 0>`
-(`LegacyStencilAccessor.h`, plus the `center-hit` / `legacy-branchless`
-passes of both examples) removes those passive writes. Measured
-multi-threaded wall-clock deltas:
-
-| Workload, config | Legacy (branchy) | Legacy (branchless, default) |
-|---------------------------|-----------------:|-----------------------------:|
-| narrow-band, 8 P-cores | no change | 140.0 → 132.1 ms (−5.6 %) |
-| narrow-band, 24 cores | no change | 66.1 → 60.3 ms (−8.8 %) |
-| synthetic, 8 P-cores | no change | 80.8 → 76.8 ms (−5.0 %) |
-| synthetic, 24 cores | no change | 35.8 → 34.3 ms (−4.2 %) |
-
-Legacy paths are backend-bound on mispredicts — the extra stores
-overlap for free in the stall cycles. The branchless paths run at
-near-peak IPC (~5.5) where there is no slack, so every retired
-instruction shows up. Classic Amdahl corollary: the closer to peak,
-the more every small thing matters.
-
-**Scope caveat** (for any future "should the library default change"
-discussion): the 1-level accessor is strictly better only for
-`GetValue`-only hot loops. `probeValue`, `probeLeaf`, and
-`isActive`/`GetState` queries do traverse at levels ≥ 1 and benefit from
-the upper/lower slots. `DefaultReadAccessor` is the right default for
-mixed workloads; opt into 1-level only when you know the loop is
-`GetValue`-exclusive.
-
-#### 8k.5 End-to-end headline numbers (updated)
-
-24-core Arrow Lake, full pipeline including decode:
-
-| Workload | branchy | branchless (default) | Speedup |
-|-----------------------------------|--------:|---------------------:|--------:|
-| Narrow-band taperLER (31.8 M) | 85 ms | **60 ms** | 1.4 × |
-| Synthetic random 50% (16.7 M) | 95 ms | **34 ms** | 2.8 × |
-
-Speedup is thread-count-independent (same ratio across 8 P-cores and
-24 cores). The two workloads' speedup *spread* — 1.4 × vs 2.8 × —
-tracks exactly how unpredictable the isOn branch is for each pattern.
-
-#### 8k.6 What this updates in the §10 Remaining list
-
-The "Branchless `LeafNode::getValue`" item is complete
-(shipped at the `LeafData` level per the scope decision, with benchmark
-coverage on both synthetic and real narrow-band workloads). 
Future
-follow-ons implied by this work but not pursued here:
-- A `ProbeValue::get` refactor that reuses the already-computed
-  `(w & bit)` from `getValue` to eliminate the redundant second
-  `isOn` test at NanoVDB.h:6302–6306.
-- Steering-team pitch for making `NANOVDB_USE_BRANCHY_GETVALUE` a
-  legacy compatibility shim (to be retired after a deprecation window)
-  rather than a permanent toggle.
-
-### 8l. Follow-up: tap-outer loop ordering in the Legacy path
-
-Tested whether flipping `legacy`'s loop nest to tap-outer,
-voxel-inner helps on spatially-coherent workloads where many voxels in
-a batch are likely to share the same `valueMask` word. Added a
-`legacy-transposed` benchmark pass in both `ex_stencil_gather_cpu` and
-`ex_narrowband_stencil_cpu`; checksums match `legacy` byte-for-byte on
-both workloads.
-
-#### 8l.1 Inlining pitfall
-
-First attempt used a runtime-args inner lambda
-`[&](int di, int dj, int dk) { for (int i = 0; i < 128; ++i) ... }`
-invoked N_taps times via a parameter-pack fold (18 at the time of the
-experiment — pre-center-tap — which is when these numbers were
-collected; the same issue and fix apply at 19). GCC refused to
-inline the instantiations — the lambda body contains a 128-iteration
-loop with `probeLeaf` + `getValue` inside, which blew past the
-per-caller inline budget × N_taps. Result: explicit `call`
-instructions to a 542-byte `processTap` function with a 6-register
-prologue/epilogue per call, and tap offsets `(di, dj, dk)` as runtime
-register arguments (one spilled to stack) — so the compiler also
-couldn't specialise the loop body per tap. That alone accounted for
-~10 ms (~13 %) of the observed slowdown vs. Legacy.
-
-Fix: templated lambda
-`[&]<int di, int dj, int dk>() [[gnu::always_inline]] { ... }`
-dispatched via `.template operator()<di, dj, dk>()` inside the fold
-(a minimal sketch of the pattern appears after the list in §8l.3).
-The standalone `processTap` symbol disappears; the transposed body grows
-from 4.4 KB → 9.8 KB (matching Legacy's 10.5 KB), and only cold-path
-tree-walk helpers remain as call targets.
-
-#### 8l.2 Results
-
-Measured at ~32M active voxel scale on i9-285K (24 threads, no HT):
-
-| Workload | Legacy (voxel-outer) | Transposed (tap-outer) | Δ |
-|----------|---------------------:|-----------------------:|--:|
-| Narrowband taperLER.vdb | 2.2 ns/vox | 2.1 ns/vox | −3 to −6 % (within noise) |
-| Synthetic 64M/50% | 2.4 ns/vox | 2.8 ns/vox | +19 % |
-
-The narrowband tap-outer edge is marginal and within the ~10 %
-run-to-run noise floor observed on this host. Synthetic's tap-outer
-slowdown is clearly outside noise. Not a consistent win.
-
-#### 8l.3 Implementation verdict: voxel-outer stays the default
-
-`LegacyStencilAccessor`'s voxel-outer `moveTo(center)` kept as the
-production default:
-
-- **Clean abstraction**: `moveTo(center)` + indexed tap access maps
-  1:1 to the stencil operator's mental model. A tap-outer batched
-  form would need external accumulator state and a centers-array
-  input, with no natural class boundary.
-- **No scratch arrays**: voxel-outer keeps the per-voxel accumulator
-  in a register and the 18-tap buffer inside the accessor; tap-outer
-  needs stack-local `centers[128]` and `s[128]`.
-- **Compiler robustness**: voxel-outer's 18-call single source
-  location is reliably collapsed by GCC. Tap-outer relies on an
-  explicit `[[gnu::always_inline]]` workaround that, if lost during
-  future refactors, would silently regress performance by ~13 %. 
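-
-For reference, a minimal, self-contained sketch of the §8l.1 fold-dispatch
-fix. The tap table and the loop body are illustrative placeholders; only the
-templated-lambda + `[[gnu::always_inline]]` dispatch is the point:
-
-```cpp
-#include <cstddef>
-#include <utility>
-
-template <std::size_t... Is>
-void processAllTaps(std::index_sequence<Is...>)
-{
-    // Tap offsets as compile-time template arguments, so GCC can specialise
-    // the 128-iteration body per tap instead of receiving (di, dj, dk) in
-    // runtime registers (the §8l.1 pitfall).
-    static constexpr int taps[][3] = {{-3,0,0}, {3,0,0}, {0,0,0}}; // placeholder
-    auto perTap = [&]<int di, int dj, int dk>() [[gnu::always_inline]] {
-        for (int i = 0; i < 128; ++i) {
-            // ... probeLeaf + getValue for voxel i at offset (di,dj,dk) ...
-        }
-    };
-    // Parameter-pack fold; .template operator()<...> is the explicit
-    // template-argument call syntax for a generic lambda.
-    (perTap.template operator()<taps[Is][0], taps[Is][1], taps[Is][2]>(), ...);
-}
-```
-
-Invoked as `processAllTaps(std::make_index_sequence<3>{});` in this toy form;
-the real pass folds over the stencil's full tap list.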
- -`legacy-transposed` retained as a benchmark pass for reference and as -a datapoint reinforcing why the hybrid `StencilAccessor` is structured -tap-outer (SIMD direction-computation inherently amortises across -lanes at the same tap). - ---- - -## 9. Relationship to Phase 1 Prototype - -`ex_stencil_gather_cpu` implements the core cache machinery as free functions. - -| Prototype component | `BatchAccessor` equivalent | -|--------------------|-----------------------------| -| `probedMask` + `ptrs[27]` locals | `mProbedMask` + `mLeafNeighbors[27]` members | -| `computeNeededDirs(expandedVec)` | per-lane loop inside `prefetch` | -| `kSentinelExpanded` broadcast | sentinel applied by caller before `prefetch` | -| `probeLeaf` loop (`toProbe` bits) | `while (toProbe)` inside `prefetch` | -| `batchPtrs[4][SIMDw]` population | replaced by `cachedGetValue` | -| `verifyBatchPtrs` | future: `cachedGetValue` unit test | - ---- - -## 10. Status and Future Work - -### Completed - -- `prefetch`: fully SIMD crossing detection, lazy probeLeaf. -- `cachedGetValue`: fully SIMD end-to-end (Steps 1–8, §8e). - Verified against scalar reference over 12M lane-checks across all 18 WENO5 taps. -- Class-level base pointers (`mOffsetBase`, `mPrefixBase`, `mMaskWordBase`). -- `simd_cast_if`, heterogeneous `gather_if`, `popcount64`/`popcount` added to `Simd.h`. -- Simd.h array-backend `Simd(const T*, element_aligned_tag)` load constructor: - removed default argument for the tag to eliminate the `Simd(0)` null-pointer-constant - ambiguity that breaks compilation under `-DNANOVDB_NO_STD_SIMD`. -- Full 7-variant codegen analysis (compiler × backend × ISA, §8f), including - before/after delta for the unmasked-gather change and `-mavx2 -mtune=native` - equivalence finding. -- **Unmasked gather (Steps 2/4a/4b):** `gather_if` replaced with `gather` using the - sentinel invariant (d ∈ [0,26]; invalid lanes read base[0]). Step 5 kept masked so - `maskWords=0` for invalid lanes → `isActive=false` without cross-width mask AND. - Verified: 12M lane-checks pass across all 18 WENO5 taps. Unlocks hardware - `vpgatherqq` in the array backend under Clang + native tuning. -- **End-to-end codegen analysis (§8h)**: measured the full WENO5 pipeline - (`StencilAccessor::moveTo` × 131 K blocks × 32 threads) on i9-285K Arrow Lake. - Established that GCC's default `-O3` outlines 14 Simd.h helpers per - `cachedGetValue` and outlines `cachedGetValue`/`WhereExpression::op=` per tap, - producing ~282 `vzeroupper` per 16-voxel batch and making the SIMD path - slower than scalar `LegacyStencilAccessor`. `[[gnu::flatten]]` on - `StencilAccessor::moveTo` collapses the full call tree and drops GCC from - 7.5 to 3.7 ns/voxel (2×), beating Clang's 4.3 ns/voxel. W=8 cuts spills by - 86% but regresses GCC end-to-end due to per-batch framing overhead. - Attributes **not applied** in the shipped code; see §8h "Not applied" note. - -- **Hybrid SIMD → scalar-tail refactor (§8i)**: shipped. `BatchAccessor::cachedGetValue` - now keeps the SIMD SWAR setup and harvests per-lane direction / local-offset - into C arrays for a plain scalar tail calling `leaf.getValue(offset)`. - Public API of `StencilAccessor` is Simd-free; performance is within - ~0.3 ns/voxel of the old flatten-forced path, but compiler-portable. - -- **PMU-counter investigation (§8j)**: validated the above empirically and - refuted two earlier working hypotheses. Specifically: - - L1 miss rate is flat across all variants (~0.4 %) — **multi-leaf L1 - pressure is not a factor**. 
- - The dominant cost (~65 % of Legacy's 5.4 ns/voxel) is branch-mispredict - stalls on the **`valueMask.isOn(offset)` check** inside - `LeafNode::getValue(offset)`. - - A branchless reformulation of that call recovers a 3× speedup - (5.6 → 2.0 ns/voxel on 32 threads) with IPC rising from 1.98 to 4.29. - - Tree-walk elimination by the 27-leaf cache saves ~0.3 ns/voxel, not - the ~3 – 4 ns/voxel implied by §8h/§8i. - - `NANOVDB_USE_INTRINSICS` is a no-op on GCC 13 at `-O3 -march=native` - (SWAR `util::countOn` is pattern-matched to hardware `popcnt`). Enable - it in the build anyway for portability. - -- **Branchless `LeafData::getValue` in `NanoVDB.h` (§8k)**: - shipped. The default body of `getValue` is now the branchless form - (`test+cmov` gate instead of a conditional jump); defining - `NANOVDB_USE_BRANCHY_GETVALUE` at compile time restores the pre-2026 - branchy version. Same semantics either way. Validated on both synthetic - random 50% (2.8× end-to-end speedup on 24 cores) and real narrow-band - `taperLER.vdb` (1.4× speedup). - -- **`ex_narrowband_stencil_cpu` (§8k.2)**: new `.vdb`-based benchmark - companion to `ex_stencil_gather_cpu`. Loads an openvdb `FloatGrid`, - converts to `ValueOnIndex` topology + separately-allocated float - sidecar, runs the same perf-decomposition battery on realistic - narrow-band workloads. - -- **Leaf-only `ReadAccessor` in benchmark scaffolding - (§8k.4)**: `LegacyStencilAccessor` and the `center-hit` / - `legacy-branchless` passes switched from `DefaultReadAccessor` (3-level - cache) to a 1-level leaf-only cache. Upper/lower slots are never - consulted for `GetValue` workloads; the switch removes passive - bookkeeping and gives 4–9 % additional speedup on branchless paths. - Scope: benchmark-only; the library default is unchanged (right default - for `probeValue`/`probeLeaf`/mixed workloads). - -- **Tap-outer loop ordering evaluation in the Legacy path (§8l)**: - added `legacy-transposed` benchmark pass (checksums match byte-for-byte) - and tested on both workloads at matched ~32M-voxel scale. Narrowband: - marginal tap-outer edge, within noise. Synthetic: tap-outer ~19 % - slower. Uncovered a GCC inlining pitfall for runtime-args inner lambdas - (fixed via templated lambda + `[[gnu::always_inline]]`). - **Verdict**: voxel-outer `LegacyStencilAccessor` remains the default — - cleaner abstraction, no scratch arrays, no compiler-inlining fragility, - and no consistent perf advantage to tap-outer. - -### Remaining - -- **`[[gnu::always_inline]]` on `Simd.h` helpers** (§8f) vs - **`[[gnu::flatten]]` on StencilAccessor-style entry points** (§8h): - two candidate approaches to restore GCC inlining. Mostly superseded - by the hybrid refactor (§8i) and the branchless-leaf opportunity - (§8j); leave open in case later callers reintroduce the outlining - pathology. - -- **`vpshufb`-based `popcount` in `Simd.h`:** replace `popcount64` SWAR tree with - nibble-LUT + `vpsadbw` pattern (§8f); reduces the out-of-line body from 88 to ≈40 - instructions and uses orthogonal execution ports. - -- **`getValue`:** lazy combined `prefetch` + `cachedGetValue`. - -- **Runtime `Coord` overload:** for generic stencil adapters iterating over an offset - list at runtime. - -- **`StencilAccessor`:** higher-level wrapper that owns the `while (any_of)` loop, - hides straddling from the caller, and fills complete stencil result arrays. - -- **Multi-leaf stencils (R > 4):** the single-neighbor-per-axis assumption in - `cachedGetValue` holds for R ≤ 4. 
Generalisation requires checking both lo and hi
-  neighbors per axis.
-
-- **C++20 structural `Coord`:** unify template and runtime interfaces via
-  `cachedGetValue<Coord{di, dj, dk}>(result, vo, leafMask)`.
-
----
-
-## 11. Target pipeline: per-block CPU WENO5 with sidecar values
-
-The work documented in §5–§8 — VBM decode, `BatchAccessor`,
-`StencilAccessor`, branchless `LeafData::getValue`, the voxel-outer vs
-tap-outer evaluation (§8l), the 19-tap `Weno5Stencil` alignment with
-canonical `WenoPt<>` ordering — is all in service of a single target
-end-to-end pipeline. For each VBM block the CPU WENO5 pass runs three
-phases:
-
-### 11.1 Phase structure
-
-**(1) Decode inverse maps** — produce `leafIndex[128]` and
-`voxelOffset[128]` from the block's `firstLeafID`, `jumpMap`, and
-`firstOffset`. Already shipped; see §2.
-
-**(2) Per-batch sidecar value assembly** — for each W-wide batch within
-the block (W = SIMD float lane width, typically 4 or 8), produce a
-dense 2D array `float mValues[Ntaps][W]` that packs every tap's float
-value for every active lane in the batch. This is where
-`StencilAccessor` (hybrid SIMD cache + scalar tail) or
-`LegacyStencilAccessor` (scalar per-voxel) plugs in — but the *output*
-shape changes from the current `uint64_t mIndices[Ntaps][W]` to a
-float array obtained via sidecar lookup (plus sign-extrapolation for
-off-band taps; see §11.2). Per-batch scope is deliberate: `mValues`
-stays resident in registers / L1 for the duration of the batch's WENO
-arithmetic; a block-wide buffer would be 19 × 128 × 4 B ≈ 9.5 KB and
-would spill L1 prematurely.
-
-**(3) Full SIMD WENO** — consume `mValues[tap][lane]` as
-`Simd<float, W>` loads (one SIMD register per tap) and evaluate the
-WENO5 reconstruction via the generic-T Simd backend. The existing
-Phase-2 GPU draft and the Simd.h infrastructure provide the
-arithmetic; this phase is essentially `nanovdb::math::WENO5<>` applied
-across W voxels simultaneously.
-
-### 11.2 Sidecar value assembly semantics
-
-For a tap at position *p = center + Δ*, the sidecar lookup is:
-
-```
-idx = leafPtr->getValue(localOffset(p))   // uint64_t, branchless
-if (idx != 0) {
-    mValues[tap][lane] = sidecar[idx]
-} else {
-    // out-of-band: voxel p is outside the narrow band
-    mValues[tap][lane] = sign_of_next_inner_tap * |sidecar[0]|
-}
-```
-
-The "next-inner tap" is the tap one step closer to the center along
-the *same* axis. This preserves a single-signed distance-field
-interpretation across the band boundary: out-of-band voxels are
-treated as "still outside on the same side as the near side," with
-magnitude set to the background `|sidecar[0]|`.
-
-| Outer tap | Sign donor (next-inner along same axis) |
-|-----------|-----------------------------------------|
-| `<+2, 0, 0>`, `<0,+2, 0>`, `<0, 0,+2>` | `<+1, 0, 0>`, `<0,+1, 0>`, `<0, 0,+1>` |
-| `<+3, 0, 0>`, `<0,+3, 0>`, `<0, 0,+3>` | `<+2, 0, 0>`, `<0,+2, 0>`, `<0, 0,+2>` |
-| `<-2, 0, 0>`, `<0,-2, 0>`, `<0, 0,-2>` | `<-1, 0, 0>`, `<0,-1, 0>`, `<0, 0,-1>` |
-| `<-3, 0, 0>`, `<0,-3, 0>`, `<0, 0,-3>` | `<-2, 0, 0>`, `<0,-2, 0>`, `<0, 0,-2>` |
-| `<±1,0,0>`, `<0,±1,0>`, `<0,0,±1>` | *see §11.4 below* |
-
-### 11.3 Loop-order implications
-
-Two forces align to favor a tap-outer, voxel-inner assembly loop in
-Phase 2:
-
-**(a) Output shape matches consumer.** `mValues[tap][lane]` is the
-natural layout for `Simd<float, W>::load(mValues[k], element_aligned)`.
-A tap-outer assembly fills this directly. A voxel-outer assembly
-would need either a transpose at the end or strided SIMD loads in
-Phase 3. 
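-
-A minimal sketch of this shape argument, with `W`, the tap count, and all
-names illustrative; the SIMD row load is shown as a scalar stand-in to stay
-backend-neutral:
-
-```cpp
-#include <cstdint>
-#include <cstring>
-
-constexpr int W = 8, NTAPS = 19; // illustrative batch width and tap count
-
-void assembleBatch(const float* sidecar,
-                   const uint64_t stencilIndex[NTAPS][W],
-                   float mValues[NTAPS][W])
-{
-    // Tap-outer, voxel-inner: each pass over `lane` fills one contiguous
-    // row mValues[tap][0..W-1] — exactly the unit Phase 3 loads per tap.
-    for (int tap = 0; tap < NTAPS; ++tap)
-        for (int lane = 0; lane < W; ++lane)
-            mValues[tap][lane] = sidecar[stencilIndex[tap][lane]];
-}
-
-float consumeOneTapRow(const float mValues[NTAPS][W], int tap)
-{
-    // Phase 3 reads one row per tap; with a SIMD backend this is a single
-    // element-aligned vector load. Scalar stand-in shown here.
-    float row[W];
-    std::memcpy(row, mValues[tap], sizeof(row));
-    float sum = 0;
-    for (int lane = 0; lane < W; ++lane) sum += row[lane];
-    return sum;
-}
-```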
-
-**(b) Sign-extrapolation dependency is tap-local.** If taps are
-filled in axis-major / ascending-|Δ| order (e.g. for the x-axis:
-first `<+1,0,0>`, then `<+2,0,0>`, then `<+3,0,0>`; similarly for
-`−1,−2,−3` and for the y and z axes), the inner tap's float value is
-already resident when the outer tap's sign-extrap check fires.
-Voxel-outer also works but repeats the sign check per voxel rather
-than once per (axis, |Δ|) pair.
-
-The §8l measurements (voxel-outer modestly beats tap-outer at
-BlockWidth=128 inner-loop size over uint64 indices) do *not* settle
-the Phase-2 loop-order question: at W=4 or W=8 inner-loop size, the
-compiler-amortisation advantage of voxel-outer shrinks drastically
-(only 4–8 voxels per unroll), while the output-layout-match benefit of
-tap-outer becomes dominant. Re-running the ordering comparison at
-the real pipeline's batch width is a required step before the
-implementation choice is locked in.
-
-### 11.4 Open questions (to resolve before implementation)
-
-**(a) Sign source for distance-1 taps.** `<±1,0,0>`, `<0,±1,0>`,
-`<0,0,±1>` have no inner tap along their axis except the center
-`<0,0,0>`. Two possible rules:
-
-- *Uniform rule:* distance-1 inherits sign from the center tap's
-  float value. Always safe; one extra sign-check per distance-1
-  tap.
-- *Invariant-based rule:* distance-1 neighbours of any active voxel
-  are guaranteed in-band, so the sign-extrap branch never fires for
-  |Δ|=1 taps. Requires confirmation against how narrow-band layers
-  are generated upstream (openvdb level-set builders).
-
-The uniform rule is the default unless the invariant can be
-confirmed and codified.
-
-**(b) Cascade behavior.** If the inner tap's value is *itself* the
-result of a prior sign-extrapolation, using its sign directly is
-correct by transitivity: when taps are processed in ascending-|Δ|
-order along each axis, the inner tap's resolved float already carries
-the correct sign (real or extrapolated), so the rule is
-self-consistent without special-casing (see the sketch following the
-§11.6 introduction). Worth capturing here because it's the quiet
-invariant that keeps the algorithm simple.
-
-### 11.5 Deliverables (not yet shipped)
-
-Implementation items that follow directly from §11.1–§11.4:
-
-- **Sidecar-aware `moveTo` variant** on `StencilAccessor` (and a
-  parallel form on `LegacyStencilAccessor`): same straddling + SIMD
-  cache structure as today, but writes `float mValues[SIZE][W]` via
-  sidecar lookup instead of `uint64_t mIndices[SIZE][W]`.
-- **Sign-extrapolation pass** — either fused into the scalar tail (per
-  §11.3b), or as a post-pass that walks taps in axis-major,
-  ascending-|Δ| order over the filled `mValues`.
-- **Phase-3 WENO kernel** — `nanovdb::math::WENO5<Simd<float, W>>`
-  driven by the 19-slot `mValues` array, following the existing
-  `WenoStencil::WENO5` arithmetic but with all reads from the
-  pre-assembled batch buffer.
-- **Batch-width ordering benchmark** — rerun the legacy/transposed
-  comparison at W=4 and W=8 over floats (not uint64 indices) to lock
-  in the Phase-2 loop order.
-
-### 11.6 Implementation status (Stage 1 / 2 / 3 landed)
-
-The target pipeline above is being landed in three incremental stages.
-All three are done; the WENO5 arithmetic + write-back phase is
-the remaining gap before a full end-to-end advection step. 
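-
-Before the stage log, a hedged scalar sketch of the §11.2 / §11.4(b) cascade
-for the x axis. Tap slot numbers follow the `Weno5Stencil` ordering from
-StencilAccessor.md §4c; `absBackground` stands for `|sidecar[0]|`; all names
-are illustrative:
-
-```cpp
-#include <cmath>
-
-// One axis of the sign-extrapolation cascade, scalar form. Slots follow the
-// Weno5Stencil tap order: 0 = center, 1..6 = x taps -3,-2,-1,+1,+2,+3.
-void extrapolateX(float v[19], const bool inBand[19], float absBackground)
-{
-    // Ascending |delta| per sign, so the donor (inner tap) is already
-    // resolved: +1 donates to +2, +2 to +3; mirrored on the minus side.
-    const int plus[3]  = {4, 5, 6}; // <+1,0,0>, <+2,0,0>, <+3,0,0>
-    const int minus[3] = {3, 2, 1}; // <-1,0,0>, <-2,0,0>, <-3,0,0>
-    for (int d = 0; d < 3; ++d) {
-        const int donorP = (d == 0) ? 0 : plus[d - 1];  // uniform rule:
-        const int donorM = (d == 0) ? 0 : minus[d - 1]; // |delta|=1 uses center
-        if (!inBand[plus[d]])
-            v[plus[d]]  = std::copysign(absBackground, v[donorP]);
-        if (!inBand[minus[d]])
-            v[minus[d]] = std::copysign(absBackground, v[donorM]);
-    }
-}
-```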
- -#### Stage 1 — batch-structured outer loop - -`ex_stencil_gather_cpu` and `ex_narrowband_stencil_cpu`: the `legacy`, -`framing`, and `legacy-transposed` passes were restructured from a -flat `for i in 0..BlockWidth` loop into a nested -`for batchStart in 0..BlockWidth step SIMDw; for i in 0..SIMDw` loop. -The `stencil` pass was already batch-structured and served as the -reference shape. Rationale: every downstream phase (sidecar fetch, -extrapolate, WENO5 reconstruct) is SIMD-wide, so the outer loop has -to match that cadence first. - -Perf cost (narrowband, 24 threads): ~5% on legacy, ~7% on -legacy-transposed — the flat-128 loop previously gave GCC more -iterations over which to amortize loop overhead. Recovered in full -by subsequent stages. - -Commit: `5a920596`. - -#### Stage 2 — sidecar value assembly (uint64 → float via lookup) - -`ex_narrowband_stencil_cpu`: three new passes that assemble the -per-batch `float values[SIZE][SIMDw]` + `bool isActive[SIZE][SIMDw]` -matrices from the sidecar, plus a stand-in token op (sum of active -tap values per voxel, written to a second sidecar at the VBM-sequential -index). Variants: - -| Pass | Gather method | Time (ns/voxel) | -|------|---------------|----------------:| -| `sidecar-legacy` | LegacyStencilAccessor scalar moveTo | 4.1 | -| `sidecar-stencil` | StencilAccessor (hybrid SIMD + scalar tail) | **3.0** | -| `sidecar-transposed` | ReadAccessor tap-outer direct probeLeaf | 4.0 | - -All three produce identical output checksums (cross-validation). -StencilAccessor wins by ~25% over the scalar paths — its SIMD moveTo -amortises direction-decode + leaf-cache across the batch, and its -contiguous `mIndices[k][i]` row feeds a vector-friendly sidecar gather. - -Supporting change: `convertToIndexGridWithSidecar` now sets -`sidecar[0] = floatGrid.background()` so the sidecar fetch is -unconditional (no per-lane branch on `idx == 0`). - -Commit: `110d852c`. - -#### Stage 3 — out-of-band extrapolation via WenoStencil - -New header `nanovdb/nanovdb/util/WenoStencil.h` (design doc: -[WenoStencil.md](WenoStencil.md)) defines `WenoStencil`: a 19-tap -value + activity container templated on SIMD lane width, with a -single-source scalar/SIMD `extrapolate(absBackground)` method. The -extrapolation implements the cascade from §11.2: out-of-band lanes -take `copysign(|background|, mValues[innerTap][lane])`, processed in -ascending-|Δ| order so the inner tap is always already resolved. - -Integration: new `sidecar-stencil-extrap` pass in -`ex_narrowband_stencil_cpu` reuses StencilAccessor for the gather, -fills a `WenoStencil`, calls `extrapolate()`, then sums all -19 taps unconditionally (no longer gated by `isActive`). - -Measured extrapolation overhead: **+4.5 ms / 31.8M voxels -= 0.14 ns/voxel** end-to-end on taperLER.vdb (24 threads, -i9-285K) — 18 SIMD blend pairs per batch, ~126 cycles per 16-voxel -batch, ~8 cycles/voxel per core. - -| Pass | ns/voxel | Checksum | -|------|---------:|----------| -| `sidecar-stencil` | 3.1 | `0xcfbff7c8` | -| `sidecar-stencil-extrap` | 3.2 | `0x371273d0` | - -Checksums differ as expected: the extrap variant sums -`mValues[k]` for all 19 taps after extrapolation, whereas -`sidecar-stencil` gates the sum by `isActive[k]` and so excludes -out-of-band lanes. - -Commit: `a6b08712`. 
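-
-Ahead of the Stage-4 `reconstruct()` item listed below, a textbook one-axis
-WENO5 sketch for orientation (classic Jiang–Shu smoothness indicators and
-weights; this is the standard arithmetic, not necessarily the exact form
-WenoStencil.md §7.1 specifies — the Phase-3 version maps `float` to
-`Simd<float, W>`):
-
-```cpp
-// 5th-order WENO reconstruction at the cell face from five consecutive
-// taps v[-2..+2] along one axis (scalar form).
-float weno5(float vm2, float vm1, float v0, float vp1, float vp2)
-{
-    // Smoothness indicators (Jiang–Shu).
-    const float b0 = 13.f/12.f*(vm2 - 2*vm1 + v0)*(vm2 - 2*vm1 + v0)
-                   + 0.25f*(vm2 - 4*vm1 + 3*v0)*(vm2 - 4*vm1 + 3*v0);
-    const float b1 = 13.f/12.f*(vm1 - 2*v0 + vp1)*(vm1 - 2*v0 + vp1)
-                   + 0.25f*(vm1 - vp1)*(vm1 - vp1);
-    const float b2 = 13.f/12.f*(v0 - 2*vp1 + vp2)*(v0 - 2*vp1 + vp2)
-                   + 0.25f*(3*v0 - 4*vp1 + vp2)*(3*v0 - 4*vp1 + vp2);
-    const float eps = 1e-6f;
-    // Nonlinear weights from ideal weights (0.1, 0.6, 0.3).
-    const float a0 = 0.1f/((eps + b0)*(eps + b0));
-    const float a1 = 0.6f/((eps + b1)*(eps + b1));
-    const float a2 = 0.3f/((eps + b2)*(eps + b2));
-    const float inv = 1.f/(a0 + a1 + a2);
-    // Candidate 3-point stencil reconstructions.
-    const float s0 = (2*vm2 - 7*vm1 + 11*v0) * (1.f/6.f);
-    const float s1 = ( -vm1 + 5*v0  +  2*vp1) * (1.f/6.f);
-    const float s2 = (2*v0  + 5*vp1 -    vp2) * (1.f/6.f);
-    return inv*(a0*s0 + a1*s1 + a2*s2);
-}
-```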
-
-#### Remaining (Stage 4+)
-
-The §11.5 "deliverables" list is now reduced to two items:
-
-- **Phase-3 WENO5 kernel** — `reconstruct()` method on
-  `WenoStencil` (or a free function consuming it) producing per-axis
-  fluxes via the Weno5 arithmetic, single-source across W. Sketched
-  in WenoStencil.md §7.1.
-- **Batch-width ordering benchmark** — at W=4 / W=8 over floats,
-  to validate that the tap-outer fill hypothesis (§11.3) holds at
-  the real Phase-3 inner-loop size before locking the shape.
diff --git a/nanovdb/nanovdb/util/LegacyStencilAccessor.h b/nanovdb/nanovdb/util/LegacyStencilAccessor.h
deleted file mode 100644
index 377267fa48..0000000000
--- a/nanovdb/nanovdb/util/LegacyStencilAccessor.h
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright Contributors to the OpenVDB Project
-// SPDX-License-Identifier: Apache-2.0
-
-/*!
-    \file LegacyStencilAccessor.h
-
-    \brief Scalar stencil-index accessor using a NanoVDB ReadAccessor.
-
-    LegacyStencilAccessor resolves each stencil point via a path-cached
-    NanoVDB ReadAccessor, one voxel at a time. It is templatized on a
-    StencilT policy class whose StencilPoints tuple defines the point offsets.
-
-    This mirrors the approach of OpenVDB's math/Stencils.h: the accessor
-    caches the last-visited tree path so that consecutive points within the
-    same leaf are cheap, but distant points (e.g. WENO5 radius-3 offsets)
-    can evict the center-leaf path.
-
-    Thread safety
-    -------------
-    Each instance owns its ReadAccessor. Construct one per thread.
-
-    Template parameters
-    -------------------
-    BuildT     NanoVDB build type (e.g. ValueOnIndex).
-    StencilT   Policy class describing the stencil. Must expose:
-                   using StencilPoints = std::tuple<S...>;
-               where each S is any type with static int members di, dj, dk
-               (e.g. WenoStencil::StencilPoint).
-*/
-
-#pragma once
-
-#include <nanovdb/NanoVDB.h>
-
-#include <cstdint>
-#include <tuple>
-#include <utility> // std::index_sequence, std::make_index_sequence
-
-namespace nanovdb {
-
-template<typename BuildT, typename StencilT>
-class LegacyStencilAccessor
-{
-    using GridT = NanoGrid<BuildT>;
-
-    static constexpr int SIZE = int(std::tuple_size_v<typename StencilT::StencilPoints>);
-
-    // Compile-time inverse map: (i,j,k) -> slot index in
-    // StencilT::StencilPoints. Returns -1 if no matching point exists;
-    // getValue() turns that into a static_assert. Same shape as
-    // WenoStencil::findPoint (kept local here to avoid a cross-header
-    // dependency).
-    template<int i, int j, int k, std::size_t... Is>
-    static constexpr int findPoint(std::index_sequence<Is...>)
-    {
-        using StencilPoints = typename StencilT::StencilPoints;
-        int result = -1;
-        ((std::tuple_element_t<Is, StencilPoints>::di == i &&
-          std::tuple_element_t<Is, StencilPoints>::dj == j &&
-          std::tuple_element_t<Is, StencilPoints>::dk == k &&
-          result < 0 ? (result = int(Is)) : 0), ...);
-        return result;
-    }
-
-public:
-    // Leaf-only ReadAccessor (cache level 0 only). The DefaultReadAccessor
-    // (levels 0/1/2) caches upper and lower nodes too, but those slots are
-    // never consulted during a GetValue cache-miss resolution -- the fallback
-    // goes straight to mRoot->getAndCache(...). Using a 1-level accessor
-    // removes passive bookkeeping of the upper/lower slots on every miss and
-    // keeps the benchmark honest about what's being measured.
-    using AccessorT = ReadAccessor<BuildT, 0>;
-
-    explicit LegacyStencilAccessor(const GridT& grid)
-        : mAcc(grid.tree().root()) {}
-
-    // -------------------------------------------------------------------------
-    // moveTo -- resolve all SIZE stencil-point indices for the voxel at @a center.
-    //
-    // Calls ReadAccessor::getValue(center + offset) for each point in
-    // StencilT::StencilPoints.
The path cache inside mAcc amortizes
-    // tree-traversal cost for nearby points, but distant points (e.g. WENO5
-    // +/-3) may evict the center-leaf path.
-    //
-    // Results are valid until the next moveTo() call.
-    // -------------------------------------------------------------------------
-    void moveTo(const Coord& center)
-    {
-        fillStencil(center, std::make_index_sequence<SIZE>{});
-    }
-
-    // -------------------------------------------------------------------------
-    // operator[] -- indexed point access. i must be in [0, SIZE).
-    // -------------------------------------------------------------------------
-    uint64_t operator[](int i) const { return mStencil[i]; }
-
-    // -------------------------------------------------------------------------
-    // getValue -- compile-time named point access.
-    // -------------------------------------------------------------------------
-    template<int i, int j, int k>
-    uint64_t getValue() const
-    {
-        constexpr int I = findPoint<i, j, k>(std::make_index_sequence<SIZE>{});
-        static_assert(I >= 0, "LegacyStencilAccessor::getValue: point not in stencil");
-        return mStencil[I];
-    }
-
-    static constexpr int size() { return SIZE; }
-
-private:
-    template<std::size_t... Is>
-    void fillStencil(const Coord& center, std::index_sequence<Is...>)
-    {
-        using StencilPoints = typename StencilT::StencilPoints;
-        ((mStencil[Is] = static_cast<uint64_t>(
-              mAcc.getValue(center + Coord(
-                  std::tuple_element_t<Is, StencilPoints>::di,
-                  std::tuple_element_t<Is, StencilPoints>::dj,
-                  std::tuple_element_t<Is, StencilPoints>::dk)))), ...);
-    }
-
-    AccessorT mAcc;
-    uint64_t mStencil[SIZE];
-};
-
-} // namespace nanovdb
diff --git a/nanovdb/nanovdb/util/StencilAccessor.md b/nanovdb/nanovdb/util/StencilAccessor.md
deleted file mode 100644
index 8d9b636f04..0000000000
--- a/nanovdb/nanovdb/util/StencilAccessor.md
+++ /dev/null
@@ -1,704 +0,0 @@
-# StencilAccessor — Design Plan
-
-Higher-level wrapper around `BatchAccessor` that owns the straddling loop,
-fills complete stencil result arrays, and presents a clean per-block API to
-the WENO (or other stencil) kernel.
-
----
-
-## 1. Purpose
-
-`StencilAccessor` wraps `BatchAccessor` and owns the full stencil evaluation
-for one SIMD-wide batch of voxels. Its output is a fixed-size array of
-`Simd<uint64_t, W>` — one vector per tap — containing the ValueOnIndex indices
-for all W lanes simultaneously. The caller uses these indices to fetch sidecar
-data (floats, etc.) independently; no value arrays are read here.
-
-```
-input:  W voxel offsets + W-wide active mask + center-leaf context
-output: Simd<uint64_t, W> × N_taps
-```
-
-This separates index gathering (StencilAccessor) from value fetching (caller),
-which is the right split: index gathering is the expensive irregular part;
-value fetching is a straight gather from a dense sidecar array that the caller
-can pipeline, prefetch, or vectorise independently.
-
----
-
-## 2. Relationship to BatchAccessor
-
-`BatchAccessor::cachedGetValue` produces one `Simd<uint64_t, W>` for
-one tap. `StencilAccessor` calls it for every tap in the stencil and assembles
-the result array. It also owns:
-
-- calling `prefetch` for every direction that the batch may cross into
-- the **straddling loop**: the `while (any_of(leafMask))` structure that handles
-  lanes whose center leaf differs from the majority and must be processed
-  separately before rejoining the batch
-
----
-
-## 3. Why StencilAccessor must own the stencil — the cache invariant
-
-`BatchAccessor`'s neighbor cache (`mNeighborLeafIDs[27]`, `mProbedMask`) is valid
-only for the current center leaf. Advancing the center leaf invalidates the cache.
- -This creates a hard ordering constraint: **all taps must be computed for a given -center leaf before the center leaf advances.** - -If the caller drove the tap loop and called `cachedGetValue` one tap at -a time, it could inadvertently interleave taps across a center-leaf transition, -producing silently wrong results (stale neighbor IDs). - -`StencilAccessor` avoids this by: -1. Holding the complete tap list at compile time. -2. Owning the center-leaf advancement loop. -3. For each center leaf: calling `prefetch` for all needed directions, then - `cachedGetValue` for all taps, before advancing. - -The straddling case makes this constraint sharper: when some lanes cross into a new -center leaf mid-batch, `StencilAccessor` peels those lanes off, runs the **full -stencil** for the new center leaf on the peeled subset, then recombines — all before -yielding the complete result array to the caller. This is only possible because -the full tap list is known upfront. - ---- - -## 4. Compile-time stencil description — `StencilT` - -The stencil is encoded in a `StencilT` policy class passed as a template argument -to `StencilAccessor`. It carries two compile-time sets: - -### 4a. Tap set - -An ordered, sized list of `(di, dj, dk)` offsets. `SIZE` determines the number -of output `Simd` vectors; the index of each tap in the list is its -slot in the output array, so the caller knows which slot corresponds to which offset. - -### 4b. Prefetch hull - -A list of **actual tap offsets** — not normalized `{-1,0,1}³` leaf directions — -that `StencilAccessor` calls `prefetch` on before evaluating any tap. - -The hull is the **minimal set of extreme taps** such that prefetching them -guarantees every `cachedGetValue` call for every stencil tap will find its -neighbor leaf already cached. - -**Why extreme taps suffice — the monotonicity argument:** - -`prefetch` computes, for each lane, which neighbor-leaf direction it -crosses into (encoded as the carry triple `(cx,cy,cz) ∈ {under,in,over}³` from -the SWAR expansion). A crossing in the −x direction occurs when `x + di < 0`, -i.e., when `x < |di|`. For a more extreme tap `hi` with `|hi| ≥ |di|` and the -same sign, `x < |di| ⟹ x < |hi|` — so any lane that the intermediate tap would -cause to cross is **also** detected by the extreme tap. The converse is not true -(the extreme tap may probe a neighbor that the intermediate tap would not reach), -but that is safe: a conservative probe wastes at most one `probeLeaf` call with -no correctness impact. - -**WENO5 (axis-aligned taps, radius 3):** -Lanes can never simultaneously cross two axis boundaries for a single tap, so -edge and corner leaf neighbors are unreachable. The 6 axis-extremal taps are -sufficient: - -``` -hull = { {-3,0,0}, {3,0,0}, - {0,-3,0}, {0,3,0}, - {0,0,-3}, {0,0,3} } -``` - -**3×3×3 box stencil (includes diagonal taps):** -A lane at `(x=0, y=0, z=0)` with tap `(-1,-1,-1)` crosses all three axes at -once, reaching the `(-1,-1,-1)` corner leaf neighbor. The 8 corner taps -`{(±1,±1,±1)}` form the hull: each corner tap, across all lane positions, -generates crossings in every combination of axes within its sign octant, -covering all 26 neighbor directions (faces, edges, and corners). - -**General rule:** the hull = the **sign-octant convex hull vertices** of the -tap set. For axis-aligned stencils these are the axis extremes; for stencils -with diagonal taps these are the corners of the tap set's bounding box in each -octant. 
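-
-To make the general rule concrete, here is the hull a full 3×3×3 box stencil
-would declare under this convention (offsets taken from the discussion above;
-the struct shape anticipates the `StencilT` sketch in §4c, and the name is
-illustrative):
-
-```cpp
-// Hull for a full 3x3x3 box stencil: the sign-octant corners of the tap
-// set's bounding box — 8 corner taps covering all 26 neighbor directions.
-struct Box3StencilHull {
-    static constexpr int HULL_SIZE = 8;
-    static constexpr nanovdb::Coord hull[HULL_SIZE] = {
-        {-1,-1,-1}, {-1,-1, 1}, {-1, 1,-1}, {-1, 1, 1},
-        { 1,-1,-1}, { 1,-1, 1}, { 1, 1,-1}, { 1, 1, 1},
-    };
-};
-```
-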
The hull is **provided explicitly** rather than derived automatically — it is a
-one-time design-time decision per stencil type, and it avoids compile-time logic
-that would need to reason about leaf size vs. tap radius.
-
-### 4c. Sketch of `StencilT` concept
-
-```cpp
-// WENO5 3D stencil: 19 taps (center + 6 per axis at ±1,±2,±3), radius 3,
-// hull = 6 extremal taps. Tap ordering matches WenoPt::idx in
-// nanovdb/math/Stencils.h, so slot k here corresponds to the same physical
-// tap as any code using the canonical WenoStencil index convention.
-struct Weno5Stencil {
-    static constexpr int SIZE = 19;
-
-    // ordered tap list: output slot i ↔ taps[i]
-    static constexpr nanovdb::Coord taps[SIZE] = {
-        { 0, 0, 0},
-        {-3,0,0}, {-2,0,0}, {-1,0,0}, {1,0,0}, {2,0,0}, {3,0,0},
-        {0,-3,0}, {0,-2,0}, {0,-1,0}, {0,1,0}, {0,2,0}, {0,3,0},
-        {0,0,-3}, {0,0,-2}, {0,0,-1}, {0,0,1}, {0,0,2}, {0,0,3},
-    };
-
-    // prefetch hull: 6 extremal taps cover all 18 non-center taps
-    // (center never crosses a leaf, so it's excluded from the hull).
-    static constexpr int HULL_SIZE = 6;
-    static constexpr nanovdb::Coord hull[HULL_SIZE] = {
-        {-3,0,0}, {3,0,0},
-        {0,-3,0}, {0,3,0},
-        {0,0,-3}, {0,0,3},
-    };
-};
-```
-
-The exact representation (constexpr arrays, parameter packs, index sequences) is
-to be refined. The conceptual contract is fixed: `StencilT` exposes `SIZE`,
-an indexed tap list, `HULL_SIZE`, and an indexed hull list — all at compile time.
-
----
-
-## 5. Template parameters and type aliases
-
-```cpp
-template<typename BuildT, typename StencilT, int W>
-class StencilAccessor {
-
-    // Scalar/SIMD split — explicit conditional, not Simd degeneracy.
-    // Matches the convention BatchAccessor already uses for its own template params.
-    using IndexVec  = std::conditional_t<W == 1, uint64_t, Simd<uint64_t, W>>;
-    using OffsetVec = std::conditional_t<W == 1, uint16_t, Simd<uint16_t, W>>;
-    using LeafIdVec = std::conditional_t<W == 1, uint32_t, Simd<uint32_t, W>>;
-
-    // Two distinct mask types — they differ in element width and in role:
-    //
-    // LeafMaskVec  — mask over leafIndex[] (uint32_t) comparisons.
-    //                Used internally in the straddling loop and passed to
-    //                BatchAccessor::prefetch / cachedGetValue.
-    //
-    // IndexMaskVec — mask over mIndices[] (uint64_t) values.
-    //                Returned by moveTo so the caller can gate reads from
-    //                Simd<uint64_t, W> stencil result vectors.
-    //
-    // In the underlying bitmask representation both are W-bit masks; the type
-    // distinction exists for semantic correctness when blending or gating on
-    // 64-bit vs 32-bit SIMD data. A widening reinterpret is needed when
-    // converting the initial LeafMaskVec activeMask to the IndexMaskVec return.
-    using LeafMaskVec  = std::conditional_t<W == 1, bool, SimdMask<uint32_t, W>>;
-    using IndexMaskVec = std::conditional_t<W == 1, bool, SimdMask<uint64_t, W>>;
-
-    // BatchAccessor is parameterised with LeafMaskVec because prefetch() and
-    // cachedGetValue() operate in the leaf-ID (uint32_t) domain.
-    using BatchAcc = std::conditional_t<W == 1, BatchAccessor<BuildT, 1>,
-                                        BatchAccessor<BuildT, W>>;
-
-    static constexpr int SIZE = std::tuple_size_v<typename StencilT::Taps>;
-    static constexpr int HULL_SIZE = std::tuple_size_v<typename StencilT::Hull>;
-};
-```
-
-W=1 gives a fully scalar `BatchAccessor` underneath with plain scalar `mIndices` —
-a clean debug and cross-validation path identical in logic to the SIMD path.
-
----
-
-## 6. Internal state
-
-```cpp
-BatchAcc mBatch;          // owns neighbor-leaf cache, mCenterLeafID, and cachedGetValue
-IndexVec mIndices[SIZE];  // one SIMD vector (or scalar) per tap — output store
-```
-
-**`mBatch`** — the embedded `BatchAccessor`. It is the **single source of truth**
-for the current center leaf ID.
`BatchAccessor` exposes a `centerLeafID()` getter -so `StencilAccessor::moveTo` can read it for the `leafSlice == currentLeafID` -comparison without maintaining a redundant copy. `StencilAccessor` drives -advancement by calling `mBatch.advance(newLeafID)`. - -`StencilAccessor` has **no separate `mCurrentLeafID` member** — having both -`mBatch.mCenterLeafID` and a local copy would be redundant state that can get -out of sync. - -**`mIndices`** — accumulation buffer filled by `moveTo`. At the **top of each -`moveTo` call**, all `SIZE` vectors are zeroed. Index 0 is the NanoVDB -IndexGrid "not found / background" sentinel, so inactive lanes (those not set -in the returned `IndexMaskVec`) yield a well-defined background index rather -than stale data. Active lanes are then written by the straddling loop via -`where`-blend; in the straddling case the blend ensures majority-leaf results -are not overwritten when minority-leaf lanes are processed. - -**Stack footprint:** for WENO5, W=16: 19 × 16 × 8 bytes = **2.375 KB**. -Acceptable for a stack-local object within a VBM block kernel; would need care -if embedded in a larger persistent structure. - ---- - -## 7. Construction and leaf-ID monotonicity - -```cpp -StencilAccessor(const GridT& grid, uint32_t firstLeafID, uint32_t nExtraLeaves) - : mBatch(grid, firstLeafID) -#ifndef NDEBUG - , mNExtraLeaves(nExtraLeaves) -#endif -{} -``` - -Constructed once per VBM block. `firstLeafID = vbmHandle.hostFirstLeafID()[blockID]` -is the correct starting center leaf — the VBM block begins there by definition. - -`nExtraLeaves` is the number of distinct center-leaf advances the straddling loop -may make across the entire block (computed from the jumpMap by the caller). It is -used only as a debug-mode assert bound; it is not needed for correctness. Once the -implementation is vetted, remove the `#ifndef NDEBUG` member, the assert in `moveTo`, -and the constructor parameter — four targeted deletions with no restructuring. - -**Leaf-ID monotonicity invariant:** The VBM assigns leaf IDs in Morton order. -Within a block, `leafIndex[0..BlockWidth-1]` is **non-decreasing**: as the voxel -index advances, the leaf IDs can only stay the same or increase — never decrease. - -This invariant is load-bearing for the straddling loop: - -- `advance(centerLeafID() + 1)` is always correct: once all lanes for leaf N are - consumed from the current batch, no future batch will ever contain a lane for - leaf N. A simple increment is sufficient; no backward search is needed. -- The `while (any_of(activeMask))` loop is guaranteed to terminate: each iteration - either removes lanes from `activeMask` (progress toward `none_of`) or increments - the center leaf (progress toward the end of the block). At most `nLeaves` - center-leaf advances occur per batch; typically zero or one. -- The `BatchAccessor` neighbor cache is never invalidated "in reverse" — its - monotonic advance matches the monotonic leaf-ID layout. - -The instance persists for the entire block (across all `moveTo` calls) and is -destroyed when the block loop advances to the next block. - ---- - -## 8. `moveTo` — signature and body - -### 8a. Signature - -```cpp -IndexMaskVec moveTo(const uint32_t* leafIndex, // ptr to leafIndex[batchStart] - const uint16_t* voxelOffset); // ptr to voxelOffset[batchStart] -``` - -Takes raw pointers into the block's decoded inverse-map arrays at the current -batch offset. 
-
----
-
-## 8. `moveTo` — signature and body
-
-### 8a. Signature
-
-```cpp
-IndexMaskVec moveTo(const uint32_t* leafIndex,    // ptr to leafIndex[batchStart]
-                    const uint16_t* voxelOffset); // ptr to voxelOffset[batchStart]
-```
-
-Takes raw pointers into the block's decoded inverse-map arrays at the current
-batch offset. Returns the **initial** active-lane mask — `(leafSlice !=
-UnusedLeafIndex)` computed before the straddling loop — converted from
-`LeafMaskVec` (uint32_t domain) to `IndexMaskVec` (uint64_t domain).
-
-The returned mask has two simultaneous readings:
-- **Validity**: lane `i` held a real voxel (not a padding sentinel).
-- **Usability**: `mIndices[k][i]` contains a valid stencil index for lane `i`.
-
-They are the same predicate because active lanes are written by `cachedGetValue`
-and inactive lanes hold 0 (zeroed at the top of `moveTo`). The straddling loop
-drains `activeMask` to zero internally; the initial mask is saved separately and
-returned so the caller always receives a meaningful result.
-
-### 8b. Straddling loop body
-
-Mirrors the `while (any_of(activeMask))` loop from
-`ex_stencil_gather_cpu/stencil_gather_cpu.cpp` (lines 698–789):
-
-```
-moveTo(leafIndex*, voxelOffset*):
-
-    // Zero all tap slots — inactive lanes will hold index 0 (NanoVDB background).
-    for I in [0, SIZE): mIndices[I] = IndexVec(0)
-
-    leafSlice  ← load W values from leafIndex (LeafIdVec)
-    voVec      ← load W values from voxelOffset (OffsetVec)
-    activeMask ← (leafSlice != UnusedLeafIndex) as LeafMaskVec
-
-    // Save initial mask before the drain loop; this is what we return.
-    resultMask ← widen(activeMask) as IndexMaskVec
-
-    if none_of(activeMask): return resultMask   // entire batch inactive
-
-    // Debug-only advance counter — see §7 for removal instructions.
-    #ifndef NDEBUG
-    uint32_t nAdvances = 0
-    #endif
-
-    while any_of(activeMask):
-
-        leafMask ← activeMask & (leafSlice == LeafIdVec(mBatch.centerLeafID()))
-
-        if none_of(leafMask):
-            // No lanes for this leaf — advance to next, assert bound.
-            mBatch.advance(mBatch.centerLeafID() + 1)
-            NANOVDB_ASSERT(++nAdvances <= mNExtraLeaves)
-            continue
-
-        // Prefetch hull (compile-time fold over StencilT::Hull)
-        for each HullPoint H in StencilT::Hull:
-            mBatch.prefetch<H::di, H::dj, H::dk>(voVec, leafMask)
-
-        // Compute all taps and blend into mIndices
-        for each tap I in [0, SIZE):
-            using P = tuple_element_t<I, StencilT::Taps>
-            tmp ← IndexVec(0)
-            mBatch.cachedGetValue<P::di, P::dj, P::dk>(tmp, voVec, leafMask)
-            where(leafMask, mIndices[I]) = tmp   // blend: preserve other lanes
-
-        activeMask &= !leafMask   // remove processed lanes
-
-    return resultMask
-```
-
-The `where`-blend is essential for correctness in straddling batches: lanes
-belonging to a second center leaf must not overwrite results already written
-for the first center leaf in the same `mIndices` slot.
-
-Note: `leafMask` is `LeafMaskVec` (uint32_t domain) while `mIndices[I]` is
-`IndexVec` (uint64_t). The `where`-blend requires either a widening cast of
-`leafMask` to `IndexMaskVec`, or a `where` overload in Simd.h that accepts
-cross-width masks. Since both are W-bit masks, this is a bitmask reinterpret
-with no data movement.
-
-### 8c. Hull and tap loops as compile-time folds
-
-Both loops expand to zero-overhead compile-time instantiations:
-
-```cpp
-// Hull prefetch fold
-[this, &voVec, &leafMask]<std::size_t... Is>(std::index_sequence<Is...>) {
-    using Hull = typename StencilT::Hull;
-    (mBatch.prefetch<
-         std::tuple_element_t<Is, Hull>::di,
-         std::tuple_element_t<Is, Hull>::dj,
-         std::tuple_element_t<Is, Hull>::dk
-     >(voVec, leafMask), ...);
-}(std::make_index_sequence<HULL_SIZE>{});
-
-// Tap cachedGetValue fold
-[this, &voVec, &leafMask]<std::size_t... Is>(std::index_sequence<Is...>) {
-    (blendOneTap<Is>(voVec, leafMask), ...);
-}(std::make_index_sequence<SIZE>{});
-```
-
-where `blendOneTap<I>` looks up tap `I` in `StencilT::Taps`, calls
-`cachedGetValue` into a temporary and then `where`-blends into `mIndices[I]`.
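-
-As a sanity check that such folds really unroll, here is a compilable
-miniature over a three-tap toy tuple (C++20; `StencilPoint` and `recordTap`
-are illustrative stand-ins, not library API):
-
-```cpp
-// Stand-alone miniature of the §8c fold pattern: one recordTap
-// instantiation per tuple element, no runtime loop or indexing.
-#include <cstdio>
-#include <tuple>
-#include <utility>
-
-template <int DI, int DJ, int DK>
-struct StencilPoint { static constexpr int di = DI, dj = DJ, dk = DK; };
-
-using Taps3 = std::tuple<StencilPoint<0,0,0>,
-                         StencilPoint<-1,0,0>,
-                         StencilPoint<1,0,0>>;
-
-template <int DI, int DJ, int DK>
-void recordTap() { std::printf("tap (%d,%d,%d)\n", DI, DJ, DK); }
-
-int main()
-{
-    []<std::size_t... Is>(std::index_sequence<Is...>) {
-        (recordTap<std::tuple_element_t<Is, Taps3>::di,
-                   std::tuple_element_t<Is, Taps3>::dj,
-                   std::tuple_element_t<Is, Taps3>::dk>(), ...);
-    }(std::make_index_sequence<std::tuple_size_v<Taps3>>{});
-}
-```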
-
-### 8.1 Hybrid SIMD → scalar-tail design and public API
-
-`StencilAccessor` uses the hybrid design documented in `BatchAccessor.md`
-§8i. The straddling loop in `moveTo` and the SWAR / direction-extraction
-portion of each tap are SIMD; `BatchAccessor::cachedGetValue` then harvests
-per-lane direction and local-offset values into stack C arrays and runs a
-scalar loop calling `leaf.getValue(offset)`. Each tap writes directly into
-`mIndices[I][0..W-1]` — one scalar `mov` per active lane, no
-mask-bool round-trip.
-
-#### Public API is Simd-free
-
-| Member | Type |
-|--------|:-----|
-| `mIndices` (public) | `alignas(64) uint64_t[SIZE][W]` — results buffer, populated by `moveTo()` |
-| `moveTo(leafIndex*, voxelOffset*)` | returns `void` |
-| `tapIndex<DI,DJ,DK>()` (static constexpr) | `int` — compile-time tap slot lookup |
-| `size()` (static constexpr) | `int` |
-
-Callers consume `mIndices` directly. Active-lane information comes from
-`leafIndex[i] != UnusedLeafIndex` — the same sentinel that `decodeInverseMaps`
-produces. No `SimdMask<>` or `Simd<>` appears in the API.
-
-```cpp
-stencilAcc.moveTo(leafIndex + bs, voxelOffset + bs);
-for (int i = 0; i < W; ++i) {
-    if (leafIndex[bs + i] == CPUVBM::UnusedLeafIndex) continue;
-    // named-tap access (compile-time, reorder-safe):
-    uint64_t idx_xm3 = stencilAcc.mIndices[SAccT::tapIndex<-3,0,0>()][i];
-    // iteration:
-    for (int k = 0; k < SAccT::size(); ++k)
-        consume(stencilAcc.mIndices[k][i]);
-}
-// SIMD load of a full tap row using the caller's own backend:
-auto row = nanovdb::util::Simd<uint64_t, W>(
-    stencilAcc.mIndices[SAccT::tapIndex<-3,0,0>()],
-    nanovdb::util::element_aligned);
-```
-
-#### Layout is ABI
-
-`mIndices[SIZE][W]` row-major is part of the contract. Changing it (for
-example to `[W][SIZE]` or to a SIMD aggregate) is a breaking change. The
-choice matches how the scalar tail produces the data, so "what's written"
-and "what's read" share a single authoritative layout.
-
-#### GCC codegen (short version)
-
-With the hybrid in place, neither compiler needs `[[gnu::flatten]]` to
-reach reasonable performance. Measured at 32 M ambient voxels / 50% / 32
-threads on i9-285K Arrow Lake: GCC 5.1 ns/voxel, Clang 4.9 ns/voxel —
-both beat the scalar `LegacyStencilAccessor` oracle (5.5 GCC, 6.7 Clang).
-Adding `flatten` on `moveTo` closes the compiler gap to ~4.8 ns/voxel on
-both; the 0.3 ns/voxel gain is not worth the 77 KB monolithic body for
-default builds. Consumers that need peak GCC performance can still
-annotate their own entry point. See `BatchAccessor.md` §8i for the full
-perf matrix and the analysis of which operations were kept SIMD vs
-scalarized.
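-
-Before moving on to measurements, a minimal model of the SIMD → scalar-tail
-handoff described at the top of this section. All names here (`Leaf`,
-`laneLeaf`, `laneOffset`) are illustrative assumptions, not the real
-`cachedGetValue` internals:
-
-```cpp
-// SIMD phase (not shown) resolves per-lane leaf pointers and local offsets
-// into stack arrays; the scalar tail then performs the irregular reads.
-#include <cstdint>
-
-struct Leaf {
-    uint64_t value[512];
-    uint64_t getValue(int off) const { return value[off]; }
-};
-
-template <int W>
-void scalarTail(const Leaf* const (&laneLeaf)[W],   // from the SIMD phase
-                const uint16_t (&laneOffset)[W],    // from the SIMD phase
-                const bool (&active)[W],
-                uint64_t (&out)[W])                 // one row of mIndices
-{
-    // One scalar mov per active lane; inactive lanes keep their zero fill.
-    for (int i = 0; i < W; ++i)
-        if (active[i])
-            out[i] = laneLeaf[i]->getValue(laneOffset[i]);
-}
-```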
-
-### 8.2 What actually bottlenecks the CPU path — `valueMask.isOn` mispredicts
-
-A PMU-counter investigation (`BatchAccessor.md` §8j) replaced several rounds
-of structural reasoning with hardware measurements. It refutes two
-hypotheses that had shaped earlier design discussions and identifies the
-one lever that dominates CPU performance.
-
-#### What we measured
-
-On a single P-core of an i9-285K (Arrow Lake, 8 P + 16 E, GCC 13 at
-`-O3 -march=native`, 32 M-voxel / 50 %-occupancy workload), comparing
-per-variant PMU counters for every benchmarking pass exposed via
-`ex_stencil_gather_cpu --pass=`:
-
-| Variant | ns/voxel | IPC | branch-miss | L1 miss |
-|---------|---------:|----:|------------:|--------:|
-| Degenerate (18 × (0,0,0), CSE'd) | 29.0 | 4.02 | **0.75 %** | 0.41 % |
-| center-hit × 18 (Legacy, same-leaf, `cmov`'d) | 19.0 | 4.80 | **0.84 %** | 0.47 % |
-| InLeaf (hybrid, 18 distinct same-leaf, no CSE) | 76.6 | 1.45 | **9.87 %** | 0.68 % |
-| Stencil (hybrid WENO5 cross-leaf) | 96.9 | 1.53 | **8.75 %** | 0.46 % |
-| Legacy (WENO5, full tree walks) | 99.2 | 1.98 | **8.85 %** | 0.40 % |
-
-#### The two big findings
-
-1. **L1-dcache miss rates are flat across all variants (~0.4–0.7 %).**
-   Multi-leaf L1 pressure — the earlier narrative for why cross-leaf taps
-   cost so much — is **not a factor** on this workload. The neighbour
-   leaves' `mValueMask` / `mPrefixSum` data stays L1-resident throughout a
-   VBM block.
-
-2. **Branch-miss rates split cleanly into two groups**, and the split is
-   not along tree-walk lines. InLeaf has *no* tree walks (it wraps taps
-   mod 8 to the centre leaf by construction) but still lands in the "bad"
-   group at 9.87 % — higher than Legacy. The common factor is the
-   **`valueMask.isOn(offset)` conditional** inside
-   `LeafNode::getValue(offset)`:
-
-   ```cpp
-   if (!(w & mask)) return 0; // data-dependent, ~50/50 outcome, unpredictable
-   ```
-
-   Every per-tap leaf lookup in the "bad" group — the hybrid's scalar tail,
-   Legacy's `legacyAcc[k]`, InLeaf's `cachedGetValueInLeaf` — routes through
-   this branch. Degenerate escapes it via CSE (18 identical taps collapse
-   to 1 evaluation). center-hit escapes it because GCC's inliner in that
-   tight loop emits the guarded return as a branchless `cmov` — an
-   optimiser accident, not a general property.
-
-#### Branchless experiment
-
-A `legacy-branchless` variant that replaces `leaf.getValue(offset)` with
-the unconditional formula inlined at the call site (see §8j.5) recovers a
-**3× speedup on Legacy**: from 5.6 ns/voxel to 2.0 ns/voxel at 32 threads,
-IPC from 1.98 to 4.29, branch-miss rate from 8.07 % to 1.67 %. The
-tree-walk machinery (`acc.probeLeaf()`) is preserved in that variant; the
-only thing removed is the single `isOn` branch per tap. That single
-change accounts for ~65 % of Legacy's total wall-clock time.
-
-#### Revised attribution of Legacy's 5.4 ns/voxel
-
-| Component | ns/voxel |
-|-----------|---------:|
-| Framing (`decodeInverseMaps`, loop, anti-DCE) | 0.25 |
-| Leaf-local `getValue` work (loads + `popcnt`) | 0.75 |
-| **`valueMask.isOn` branch mispredicts** (~24/voxel × ~15 cy) | **~3.6** |
-| Tree walk vs 27-leaf cache differential | ~0.3 |
-| Multi-leaf L1 pressure | ~0 |
-| **Total** | **~5.4** |
-
-Earlier versions of this section attributed the bulk of Legacy's cost to
-tree-walk pointer chases and multi-leaf L1 traffic; both turned out to
-be minor. The hybrid `StencilAccessor` matches Legacy (~5.1 ns/voxel)
-because both pay the same dominant `isOn` mispredict cost.
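-
-For reference, a schematic of the two `getValue` bodies being contrasted.
-Field names (`words`, `firstIndex`) and the rank computation are simplified
-stand-ins; the real `LeafData` uses a precomputed prefix sum (`mPrefixSum`)
-instead of the word loop:
-
-```cpp
-#include <cstdint>
-
-struct ToyLeaf {
-    uint64_t words[8];    // 512-bit active-voxel mask
-    uint64_t firstIndex;  // sequential index of this leaf's first active voxel
-};
-
-static uint64_t rankOf(const ToyLeaf& leaf, uint32_t n, uint64_t w, uint64_t mask)
-{
-    uint64_t rank = __builtin_popcountll(w & (mask - 1));
-    for (uint32_t i = 0; i < (n >> 6); ++i)
-        rank += __builtin_popcountll(leaf.words[i]);
-    return rank;
-}
-
-// Branchy: the (w & mask) test is data-dependent and ~50/50 on narrow-band
-// inputs, so it mispredicts chronically.
-uint64_t getValueBranchy(const ToyLeaf& leaf, uint32_t n)
-{
-    const uint64_t w = leaf.words[n >> 6], mask = 1ull << (n & 63);
-    if (!(w & mask)) return 0;   // background sentinel
-    return leaf.firstIndex + rankOf(leaf, n, w, mask);
-}
-
-// Branchless: compute the index unconditionally and multiply by the 0/1
-// activity bit; the mispredict disappears for a few extra ALU ops.
-uint64_t getValueBranchless(const ToyLeaf& leaf, uint32_t n)
-{
-    const uint64_t w = leaf.words[n >> 6], mask = 1ull << (n & 63);
-    const uint64_t isOn = (w >> (n & 63)) & 1u;
-    return isOn * (leaf.firstIndex + rankOf(leaf, n, w, mask));
-}
-```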
-
-#### Consequence for architectural decisions
-
-- **The shipped hybrid design is the right API choice** (Simd-free public
-  surface, compiler-portable) but its wall-clock edge over Legacy is
-  marginal (~0.3 ns/voxel), not the ~3 ns/voxel originally implied.
-- **The cheap architectural win was a branchless reformulation of
-  `LeafData::getValue`**: shipped as the default body of
-  `getValue` in `NanoVDB.h` (see `BatchAccessor.md` §8k), gated by
-  `NANOVDB_USE_BRANCHY_GETVALUE` to restore the old branchy form.
-  End-to-end 1.4× on realistic narrow-band workloads, 2.8× on
-  random-access.
-
-See `BatchAccessor.md` §8j for the original measurement matrix and
-correction log (§8g/§8h/§8i), and `BatchAccessor.md` §8k for the
-follow-on that made `getValue` branchless-by-default, added the
-narrow-band validation benchmark (`ex_narrowband_stencil_cpu`), and
-the leaf-only `ReadAccessor` finding.
-
----
-
-## 9. `tapIndex()` — compile-time slot lookup, `mIndices[][]` access
-
-> **API evolution.** Earlier drafts of this document described a
-> `getValue<DI,DJ,DK>() const → const IndexVec&` member and an
-> `operator[](int) → const IndexVec&` accessor. Both were removed in the
-> hybrid refactor (§8.1). The results buffer is now a plain public 2D
-> C array; callers pick their own access pattern. The change aligns with
-> the hybrid's Simd-free public API — no `Simd<>` or `SimdMask<>` type
-> appears in the class's public interface.
-
-```cpp
-// Storage — public, part of the ABI:
-alignas(64) uint64_t mIndices[SIZE][W];
-
-// Compile-time slot lookup (reorder-safe, zero runtime cost):
-template <int DI, int DJ, int DK>
-static constexpr int tapIndex() {
-    constexpr int I = detail::findIndex<DI, DJ, DK, typename StencilT::Taps>(
-        std::make_index_sequence<SIZE>{});
-    static_assert(I >= 0, "StencilAccessor::tapIndex: tap not in stencil");
-    return I;
-}
-
-// Iteration bound:
-static constexpr int size() { return SIZE; }
-```
-
-**Inverse map** (`detail::findIndex`): a `constexpr` fold over all `SIZE`
-taps, comparing `(DI,DJ,DK)` against each `StencilPoint`. O(N) compile-time
-evaluations — negligible for realistic stencil sizes. Resolved entirely at
-compile time; `tapIndex<-3,0,0>()` compiles to an integer literal. One
-possible shape for `findIndex` is sketched at the end of this section.
-
-**`static_assert`**: catches invalid tap coordinates at compile time with a
-clear message. Same safety guarantee as OpenVDB stencil's bounds check.
-
-**Lifetime**: `mIndices` is valid only until the next `moveTo` call. The
-caller must not cache references across batches.
-
-**Why expose `mIndices` directly** (rather than a method that returns it):
-the results buffer is plain data — no lazy work, no layout translation, no
-invariants to enforce. Hiding it behind an accessor would pretend
-otherwise. Direct access also lets callers choose their SIMD load pattern
-(or scalar iteration) without our API imposing one.
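-
-As promised above, one possible shape for `detail::findIndex`, matching the
-call site in the sketch above (illustrative only; the real helper may
-differ):
-
-```cpp
-// Constexpr linear scan over the Taps tuple: returns the slot whose
-// StencilPoint equals (DI,DJ,DK), or -1 so the caller's static_assert fires.
-#include <cstddef>
-#include <tuple>
-#include <utility>
-
-namespace detail {
-template <int DI, int DJ, int DK, typename Taps, std::size_t... Is>
-constexpr int findIndex(std::index_sequence<Is...>)
-{
-    int found = -1;
-    // Fold over all taps; at most one (DI,DJ,DK) match by construction.
-    ((std::tuple_element_t<Is, Taps>::di == DI &&
-      std::tuple_element_t<Is, Taps>::dj == DJ &&
-      std::tuple_element_t<Is, Taps>::dk == DK ? (found = int(Is)) : 0), ...);
-    return found;
-}
-} // namespace detail
-```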
-
----
-
-## 10. Caller-side usage pattern
-
-```cpp
-// Construct once per VBM block.
-using stencilAccT = StencilAccessor<GridT, Weno5Stencil, W>;
-stencilAccT stencil(grid, vbm.firstLeafID(blockID), nExtraLeaves);
-
-// Active-lane information comes from decodeInverseMaps's UnusedLeafIndex
-// sentinel — the same source that StencilAccessor uses internally.
-for (int bs = 0; bs < BlockWidth; bs += W) {
-    stencil.moveTo(leafIndex + bs, voxelOffset + bs); // returns void
-
-    // Option A: scalar iteration across lanes and taps.
-    for (int i = 0; i < W; ++i) {
-        if (leafIndex[bs + i] == UnusedLeafIndex) continue;
-        for (int k = 0; k < stencilAccT::size(); ++k) {
-            consume(stencil.mIndices[k][i]); // uint64_t
-        }
-    }
-
-    // Option B: SIMD load of an entire tap row (caller picks backend/width).
-    auto row_m3 = util::Simd<uint64_t, W>(
-        stencil.mIndices[stencilAccT::tapIndex<-3, 0, 0>()],
-        util::element_aligned);
-
-    // Option C: compile-time named tap access for a handful of taps
-    // (for a given lane i inside a lane loop):
-    const uint64_t& xm3 = stencil.mIndices[stencilAccT::tapIndex<-3,0,0>()][i];
-}
-// stencil destroyed here (end of block scope)
-```
-
-No `Simd<>` or `SimdMask<>` types appear in the public API. The caller
-uses its own SIMD backend (or none) to consume `mIndices`.
-
----
-
-## 11. Ownership summary
-
-| Concern | Owner |
-|---------|-------|
-| Neighbor-leaf cache (`mNeighborLeafIDs[27]`, `mProbedMask`) | `BatchAccessor` |
-| Cache population | `BatchAccessor::prefetch` (called by `StencilAccessor`) |
-| Cache invalidation | `BatchAccessor` constructor + `advance()` — both clear `mProbedMask` and set `mCenterLeafID`; neither rebuilds the cache |
-| `cachedGetValue` | `BatchAccessor` (called by `StencilAccessor`) |
-| `advance(newLeafID)` | `BatchAccessor` — this is the only legitimate setter for `mCenterLeafID`; no raw setter exists (would bypass cache invalidation) |
-| `mCenterLeafID` read access | `BatchAccessor::centerLeafID()` getter — exposed to `StencilAccessor`; no external setter |
-| `leafMask` computation | `StencilAccessor` (derived inside `moveTo`) |
-| Straddling loop | `StencilAccessor` |
-| Hull prefetch sequencing | `StencilAccessor` |
-| Tap fold (writes directly into `mIndices[Is]`) | `StencilAccessor::calcTaps` |
-| `mIndices[SIZE][W]` storage and zeroing | `StencilAccessor` (public member; `std::memset` at top of each `moveTo`) |
-| `nExtraLeaves` debug bound | `StencilAccessor` (`#ifndef NDEBUG` member; removable) |
-| Center-leaf lifetime (block scope) | Caller |
-
----
-
-## 12. Design decisions (all resolved)
-
-> **Evolution.** Decisions 1 and 3 below have been superseded by the
-> hybrid refactor (§8.1): `moveTo` now returns `void`, and `operator[]` /
-> `getValue<>()` were removed in favour of public `mIndices` access +
-> `tapIndex<>()`. The original rationales are preserved for historical
-> context; the current API is §9's.
-
-1. **`moveTo` return type — ~~`IndexMaskVec` by value~~ `void` (revised §8.1).**
-   *Original rationale:* The initial
-   `activeMask = (leafSlice != UnusedLeafIndex)` was saved before the
-   straddling loop drains it to zero, widened from `LeafMaskVec` (uint32_t)
-   to `IndexMaskVec` (uint64_t), and returned. This gave the caller a mask
-   semantically aligned with the uint64_t `mIndices` data.
-   *Revised:* `moveTo` now returns `void`. The active-lane information is
-   redundant: callers already have `leafIndex[]` from `decodeInverseMaps`
-   and the same `UnusedLeafIndex` sentinel that `StencilAccessor` uses
-   internally. Returning the mask duplicated state and forced a
-   heterogeneous `SimdMask<uint32_t, W>` → `SimdMask<uint64_t, W>` widening
-   with a boolean round-trip (§8h) — all for zero information gain. Removing
-   it also eliminated the last `SimdMask<>` type from the public API.
-
-2. **Inactive-lane `mIndices` values — zeroed at top of `moveTo`.**
-   `mIndices` is set to zero (via `std::memset`) at the start of every
-   `moveTo` call. Index 0 is the NanoVDB IndexGrid "not found / background"
-   sentinel, so inactive lanes yield a well-defined background index rather
-   than stale data. The cost is a single `memset` of `SIZE * W * 8` bytes
-   per call (2432 B for WENO5 W=16), which stays in L1 and pipelines under
-   other work.
-
-3. **~~`operator[]` — public, const-ref, no bounds check~~ removed (revised §9).**
-   *Original:* `const IndexVec& operator[](int i) const { return mIndices[i]; }`
-   for kernels that iterate over all taps generically.
-   *Revised:* `mIndices` is now a public member (§9); direct indexing
-   replaces both `operator[](int)` and `getValue<DI,DJ,DK>()`.
-   Named-tap access is via the `tapIndex<DI,DJ,DK>()` static constexpr slot
-   lookup. This change is consistent with the hybrid's Simd-free public API —
-   no method can now return a `Simd<>` or `SimdMask<>` reference.
-
-4. **`StencilT` representation — `std::tuple<StencilPoint<di,dj,dk>...>` for both
-   `Taps` and `Hull`.**
-   The compile-time fold in §8c requires `std::tuple_element_t<I, Taps>::di` to
-   be a compile-time constant. This is clean with a tuple-of-types but not with a
-   constexpr array indexed by a template parameter. `std::tuple_size_v` and
-   `std::tuple_element_t` are the sole introspection mechanisms needed.
-
-5. **`BatchAccessor::centerLeafID()` getter — add; no raw setter.**
-   ```cpp
-   uint32_t centerLeafID() const { return mCenterLeafID; }
-   ```
-   The only change required in `BatchAccessor`. No raw setter: `advance()` is
-   the sole legitimate state transition for `mCenterLeafID`. Both the constructor
-   and `advance()` only **invalidate** the cache (clear `mProbedMask`); they do
-   not rebuild it. Cache population is entirely the caller's responsibility via
-   `prefetch()`, called by `StencilAccessor` inside the straddling loop.
-
-6. **`nExtraLeaves` — kept as a removable debug sanity check.**
-   Passed to the constructor, stored as `#ifndef NDEBUG uint32_t mNExtraLeaves`
-   member, asserted against a local `nAdvances` counter on each `advance()` call
-   inside `moveTo`. Termination is guaranteed by the VBM monotonicity invariant
-   (§7) without this bound; the bound is belt-and-suspenders only. To remove once
-   vetted: delete the `#ifndef NDEBUG` member block, the `nAdvances` counter
-   block, the assert line, and the `nExtraLeaves` constructor parameter — four
-   targeted deletions.
diff --git a/nanovdb/nanovdb/util/WenoStencil.h b/nanovdb/nanovdb/util/WenoStencil.h
index e3fe994ccb..5f5ea7fc58 100644
--- a/nanovdb/nanovdb/util/WenoStencil.h
+++ b/nanovdb/nanovdb/util/WenoStencil.h
@@ -32,9 +32,6 @@
    ascending-|d| order so the inner point is already resolved.
 
  - normSqGrad(isoValue = 0)    Godunov's norm-square of the fifth-order WENO
    upwind gradient.
-
-  See BatchAccessor.md Sec. 11 for the full Phase-2 sidecar-WENO pipeline design
-  and Sec. 11.2 for the extrapolation semantics.
 */
 
 #pragma once
@@ -156,7 +153,7 @@ GodunovsNormSqrd(MaskType isOutside,
 // stencil state. Holds ValueType-typed values + MaskType-typed activity
 // flags + scalar grid constants. Fill-side responsibility (scalar writes
 // into any raw buffers, followed by a per-point load into this stencil's
-// values[] / isActive[]) lives in the caller. See WenoStencil.md Sec. 6
+// values[] / isActive[]) lives in the caller. See WenoStencil.md
 // for usage patterns.
 // ---------------------------------------------------------------------------
 template <typename ValueType, typename MaskType, int W>
@@ -214,6 +211,18 @@ class WenoStencil
         return I;
     }
 
+    // Resolve all SIZE stencil-point indices for the voxel at @a center via
+    // Acc::getValue(center + offset). Indices land in out[0..SIZE-1] in the
+    // StencilPoints tuple ordering. Acc is any NanoVDB accessor whose
+    // getValue() returns a value convertible to uint64_t (e.g. ValueOnIndex's
+    // sequential active-voxel indices). The accessor's path cache is reused
+    // across the SIZE getValue calls.
+    template <typename Acc>
+    static void gatherIndices(Acc& acc, const Coord& center, uint64_t* out)
+    {
+        gatherIndicesImpl(acc, center, out, std::make_index_sequence<SIZE>{});
+    }
+
     // ------------------------------------------------------------------
     // extrapolate -- sign-correct out-of-band lanes (isActive[k][i] == false)
    // of values[k] by multiplying with Sign(values[innerPoint][i]). Active
@@ -241,7 +250,7 @@ class WenoStencil
     // Godunov's upwind combinator driven by the sign of (center - iso).
     //
     // Call only after the stencil has been populated (see usage pattern in
-    // WenoStencil.md Sec. 6). extrapolate() before normSqGrad() is the
+    // WenoStencil.md). extrapolate() before normSqGrad() is the
     // typical pipeline shape but is not required by this method.
     // ------------------------------------------------------------------
     __hostdev__ NANOVDB_FORCEINLINE ValueType normSqGrad(float iso = 0.f) const;
@@ -261,6 +270,18 @@ class WenoStencil
         return result;
     }
 
+    // Parameter-pack expansion driving gatherIndices(): unrolls SIZE getValue
+    // calls into a single fold expression.
+    template <typename Acc, std::size_t... Is>
+    static void gatherIndicesImpl(Acc& acc, const Coord& center, uint64_t* out,
+                                  std::index_sequence<Is...>)
+    {
+        ((out[Is] = static_cast<uint64_t>(acc.getValue(center + Coord(
+             std::tuple_element_t<Is, StencilPoints>::di,
+             std::tuple_element_t<Is, StencilPoints>::dj,
+             std::tuple_element_t<Is, StencilPoints>::dk)))), ...);
+    }
+
     // Hardcoded (point, innerPoint) pairs for the 19-point StencilPoints
     // tuple, ordered by ascending |d| so the inner point is always already
     // resolved when the outer point is processed. Indices match the
diff --git a/nanovdb/nanovdb/util/WenoStencil.md b/nanovdb/nanovdb/util/WenoStencil.md
index a68c88a335..cedd05f63a 100644
--- a/nanovdb/nanovdb/util/WenoStencil.md
+++ b/nanovdb/nanovdb/util/WenoStencil.md
@@ -2,16 +2,14 @@
 
 Design reference for `nanovdb/nanovdb/util/WenoStencil.h`. Captures the
 rationale behind the templated-on-lane-width class, the out-of-band
-extrapolation algorithm, the Godunov norm-square-gradient method, and the
-relationship to the broader Phase-2 pipeline sketched in
-`BatchAccessor.md §11`.
+extrapolation algorithm, and the Godunov norm-square-gradient method.
 
 ---
 
 ## 1. Motivation
 
-The WENO5 CPU pipeline (`BatchAccessor.md §11`) assembles, per voxel
-batch, a 19-tap value matrix with per-tap activity flags:
+The WENO5 CPU pipeline assembles, per voxel batch, a 19-tap value
+matrix with per-tap activity flags:
 
 ```
 float values[Ntaps][W] -- real sidecar value, or background for OOB lanes
@@ -418,16 +416,7 @@ scalar pattern fits its source/sink.
 
 ## 7. Future work
 
-### 7.1 Measurement — lock in the perf numbers
-
-Reconstruct()-path (normSqGrad) cost hasn't been measured yet. Next
-step: add a `sidecar-stencil-normsqgrad` benchmark pass in
-`ex_narrowband_stencil_cpu` to drive normSqGrad to completion on
-taperLER.vdb; compare against `sidecar-stencil-extrap` (which writes
-the tap-sum instead of normSqGrad) to isolate the Phase-3 arithmetic
-cost.
-
-### 7.2 Alternative stencils
+### 7.1 Alternative stencils
 
 If/when Weno7 or a non-axis-aligned stencil is needed, the class
 would specialise on a stencil-policy template parameter rather than
@@ -443,20 +432,3 @@ constexpr pass that finds, for each tap, the same-axis neighbour
 with |Δ| = |tap.Δ| − 1. Not needed until a second axis-aligned
 stencil exists.
-
----
-
-## 8. Relationship to other design docs
-
-- **`BatchAccessor.md §11`** — the broader Phase-2/3 pipeline plan
-  (VBM decode → sidecar assembly → extrapolation → WENO arithmetic
-  → write-back). `WenoStencil` implements the extrapolation and
-  (now) the WENO arithmetic steps; the storage carries data across
-  from sidecar-assembly.
-- **`StencilAccessor.md`** — Phase-1 accessor (batched uint64 index
-  gather). `StencilAccessor` fills `mIndices[SIZE][W]`; callers
-  consume those indices (via `sidecar[idx]` in their fill loops) and
-  populate `WenoStencil::values[]` / `isActive[]`.
-- **`nanovdb/math/Stencils.h`** — the scalar ground-truth for WENO5
-  and Godunov. `WenoStencil::normSqGrad()` is a line-for-line
-  transliteration of `nanovdb::math::WenoStencil::normSqGrad()`
-  to generic-T form.

From 0c8b58279d2f8de0f7a59d22451a2fbf71e59f95 Mon Sep 17 00:00:00 2001
From: Efty Sifakis
Date: Wed, 29 Apr 2026 14:16:29 -0500
Subject: [PATCH 60/60] nanovdb/util/ForEach.h: drop hardware_concurrency()>>1
 in std::thread fallback

The fallback was using only half of the hardware threads. On modern
hybrid CPUs (e.g. P+E without HT) this halves available parallelism
for no benefit; just use hardware_concurrency() directly.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Efty Sifakis
---
 nanovdb/nanovdb/util/ForEach.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanovdb/nanovdb/util/ForEach.h b/nanovdb/nanovdb/util/ForEach.h
index d71769c5ab..6ede538a10 100644
--- a/nanovdb/nanovdb/util/ForEach.h
+++ b/nanovdb/nanovdb/util/ForEach.h
@@ -45,7 +45,7 @@ inline void forEach(RangeT range, const FuncT &func)
 #ifdef NANOVDB_USE_TBB
     tbb::parallel_for(range, func);
 #else// naive and likely slow alternative based on std::thread
-    if (const size_t threadCount = std::thread::hardware_concurrency()>>1) {
+    if (const size_t threadCount = std::thread::hardware_concurrency()) {
         std::vector<RangeT> rangePool{ range };
         while(rangePool.size() < threadCount) {
             const size_t oldSize = rangePool.size();