docs

0ax1 · 0ax1 · commit 637a07d3223e · 2026-06-17T15:43:50.000Z
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
diff --git a/vortex-cuda/kernels/src/arrow_validity.cu b/vortex-cuda/kernels/src/arrow_validity.cu
@@ -7,8 +7,9 @@
 
 namespace {
 
-// Load the `word_idx`-th little-endian u64 of `input`, treating bytes outside
-// `[0, input_bytes)` as zero. `input` must be 8-byte aligned.
+// Transform up to 8 input bytes into a zero-extended 64-bit word:
+//
+//   [ b0 ][ b1 ][ b2 ] | end  ->  [ b0 ][ b1 ][ b2 ][ 00 ][ 00 ][ 00 ][ 00 ][ 00 ]
 __device__ uint64_t load_input_word(const uint8_t *const input, int64_t word_idx, uint64_t input_bytes) {
     if (word_idx < 0) {
         return 0;
@@ -28,11 +29,17 @@ __device__ uint64_t load_input_word(const uint8_t *const input, int64_t word_idx
     return word;
 }
 
-// Build one 64-bit word of the Arrow validity bitmap.
+// Build one output word for sliced validity. The row bits are the same, but
+// row 0 may live at a different bit position in the source and Arrow bitmaps.
+// For example, `input_offset = 5` and `arrow_offset = 0` shifts row0 from bit 5
+// in the input bitmap to bit 0 in the Arrow bitmap.
+//
+//   input bitmap:  [ . ][ . ][ . ][ . ][ . ][ row0 ][ row1 ][ row2 ]....
+//                                            ^ input_offset
+//   Arrow bitmap:  [ row0 ][ row1 ][ row2 ]....
+//                     ^ arrow_offset
 //
-// Output bit `b` for `b` in `[arrow_offset, validity_bits)` equals input bit `b + shift`;
-// all other bits are zero. Two adjacent input words are funnel-shifted to align the input
-// bits with the output word, then the leading/trailing edges are masked.
+// Padding bits are cleared so word-sized validity readers can safely over-read.
 __device__ uint64_t repack_word(const uint8_t *const input,
                                 uint64_t word_idx,
                                 int64_t shift,
@@ -56,115 +63,129 @@ __device__ uint64_t repack_word(const uint8_t *const input,
         return 0;
     }
 
-    // `>> 6` floors also for negative bit positions, unlike `/ 64` which truncates toward zero.
-    const int64_t input_bit = static_cast<int64_t>(word_start) + shift;
-    const int64_t input_word = input_bit >> 6;
-    const uint32_t bit = static_cast<uint32_t>(input_bit & 63);
+    // Each output bit `b` reads source bit `b + shift`.
+    // `>> 6` floors for negative positions, unlike `/ 64` which truncates toward zero.
+    const int64_t source_bit_start = static_cast<int64_t>(word_start) + shift;
+    const int64_t source_word = source_bit_start >> 6;
+    const uint32_t source_bit = static_cast<uint32_t>(source_bit_start & 63);
 
-    const uint64_t lo = load_input_word(input, input_word, input_bytes);
-    if (bit == 0) {
+    const uint64_t lo = load_input_word(input, source_word, input_bytes);
+    if (source_bit == 0) {
         return lo & mask;
     }
-    const uint64_t hi = load_input_word(input, input_word + 1, input_bytes);
-    return ((lo >> bit) | (hi << (64 - bit))) & mask;
+    const uint64_t hi = load_input_word(input, source_word + 1, input_bytes);
+    return ((lo >> source_bit) | (hi << (64 - source_bit))) & mask;
 }
 
-// Rebuild a possibly bit-offset Vortex validity bitmap into an Arrow-compatible bitmap.
-//
-// `input_offset` is the bit offset into `input`; `arrow_offset` is the logical Arrow array offset
-// to preserve in the output. Bits outside `[arrow_offset, arrow_offset + len)` are left unset.
-// The output allocation must hold `ceil((len + arrow_offset) / 64)` full 64-bit words; every
-// word is written, so no zero-initialization of the output is required.
-__device__ void arrow_validity_repack_device(const uint8_t *const input,
-                                             uint64_t *const output,
-                                             uint64_t len,
-                                             uint64_t input_offset,
-                                             uint64_t arrow_offset,
-                                             uint64_t input_bytes) {
-    // One worker owns a contiguous range of output words. Each word is rebuilt locally so
-    // there are no cross-thread bit writes or atomics.
-    const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
-    const uint64_t validity_bits = len + arrow_offset;
-    const uint64_t output_words = (validity_bits + 63) / 64;
-    const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
-
-    // Translate Arrow-visible output bits back to source bitmap bits. The source bitmap may
-    // start at any bit offset, while Arrow's buffer pointer is byte-addressed.
-    const int64_t shift = static_cast<int64_t>(input_offset) - static_cast<int64_t>(arrow_offset);
-
-    for (uint64_t word_idx = worker; word_idx < output_words; word_idx += stride) {
-        output[word_idx] = repack_word(input, word_idx, shift, arrow_offset, validity_bits, input_bytes);
-    }
-}
+constexpr uint32_t WARP_SIZE = 32;
+constexpr uint32_t FULL_WARP_MASK = 0xffffffff;
 
+// First reduction step for the count kernel: sum one value per lane so each
+// warp produces a single partial count.
+//
+//   lanes:  [a][b][c][d]... -> lane 0: a+b+c+d+...
 __device__ uint64_t warp_sum(uint64_t value) {
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        value += __shfl_down_sync(0xffffffff, value, offset);
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+        value += __shfl_down_sync(FULL_WARP_MASK, value, offset);
     }
     return value;
 }
 
-__device__ void arrow_validity_count_valid_device(const uint8_t *const input,
-                                                  uint64_t *const output,
-                                                  uint64_t len,
-                                                  uint64_t arrow_offset) {
-    __shared__ uint64_t warp_counts[32];
+// Mask one bitmap byte down to actual rows. This keeps null counting from
+// including Arrow offset padding or trailing padding bits.
+//
+//   byte bits:  [ pad ][ row ][ row ][ row ][ pad ]
+//   mask:       [  0  ][  1  ][  1  ][  1  ][  0  ]
+__device__ uint32_t arrow_validity_byte_mask(uint64_t byte_idx,
+                                             uint64_t arrow_offset,
+                                             uint64_t validity_bits) {
+    const uint64_t byte_start = byte_idx * 8;
 
-    const uint32_t thread = threadIdx.x;
-    const uint64_t worker = blockIdx.x * blockDim.x + thread;
-    const uint64_t validity_bits = len + arrow_offset;
-    const uint64_t input_bytes = (validity_bits + 7) / 8;
-    const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
+    uint32_t mask = 0xff;
+    if (byte_start < arrow_offset) {
+        const uint64_t lead = arrow_offset - byte_start;
+        mask = lead >= 8 ? 0 : mask << lead;
+    }
 
-    uint64_t valid_count = 0;
-    for (uint64_t byte_idx = worker; byte_idx < input_bytes; byte_idx += stride) {
-        const uint64_t byte_start = byte_idx * 8;
-        uint32_t mask = 0xff;
-        if (byte_start < arrow_offset) {
-            const uint64_t lead = arrow_offset - byte_start;
-            mask = lead >= 8 ? 0 : mask << lead;
-        }
-        const uint64_t remaining = validity_bits - byte_start;
-        if (remaining < 8) {
-            mask &= (uint32_t {1} << remaining) - 1;
-        }
-        valid_count += __popc(static_cast<uint32_t>(input[byte_idx]) & mask);
+    const uint64_t remaining = validity_bits - byte_start;
+    if (remaining < 8) {
+        mask &= (uint32_t {1} << remaining) - 1;
     }
+    return mask;
+}
 
-    const uint32_t lane = thread & 31;
-    const uint32_t warp = thread >> 5;
-    valid_count = warp_sum(valid_count);
+// Combine warp partial counts into one block total. Thread 0 returns the total and
+// every other thread returns 0, so the count kernel does one global atomic per block.
+//
+//   per-thread counts -> per-warp sums -> block sum -> atomicAdd
+__device__ uint64_t block_sum_to_thread_zero(uint64_t value, uint64_t *const warp_counts) {
+    const uint32_t thread = threadIdx.x;
+    const uint32_t lane = thread & (WARP_SIZE - 1);
+    const uint32_t warp = thread / WARP_SIZE;
+    const uint32_t block_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
+
+    value = warp_sum(value);
     if (lane == 0) {
-        warp_counts[warp] = valid_count;
+        warp_counts[warp] = value;
     }
     __syncthreads();
 
-    valid_count = thread < (blockDim.x + 31) / 32 ? warp_counts[lane] : 0;
-    if (warp == 0) {
-        valid_count = warp_sum(valid_count);
-        if (lane == 0) {
-            atomicAdd(reinterpret_cast<unsigned long long *>(output),
-                      static_cast<unsigned long long>(valid_count));
-        }
-    }
+    value = lane < block_warps ? warp_counts[lane] : 0;
+    value = warp == 0 ? warp_sum(value) : 0;
+    return thread == 0 ? value : 0;
 }
 
 } // namespace
 
-// CUDA entry point for validity bitmap repacking used by Arrow Device export.
+// Repack sliced validity when the source bitmap offset does not match the
+// Arrow array offset. Each thread writes independent output words.
+//
+//   thread 0 -> output word 0, word N, ...
+//   thread 1 -> output word 1, word N+1, ...
 extern "C" __global__ void arrow_validity_repack(const uint8_t *const input,
                                                  uint64_t *const output,
                                                  uint64_t len,
                                                  uint64_t input_offset,
                                                  uint64_t arrow_offset,
                                                  uint64_t input_bytes) {
-    arrow_validity_repack_device(input, output, len, input_offset, arrow_offset, input_bytes);
+    const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint64_t validity_bits = len + arrow_offset;
+    const uint64_t output_words = (validity_bits + 63) / 64;
+    const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
+    const int64_t shift = static_cast<int64_t>(input_offset) - static_cast<int64_t>(arrow_offset);
+
+    for (uint64_t word_idx = worker; word_idx < output_words; word_idx += stride) {
+        output[word_idx] = repack_word(input, word_idx, shift, arrow_offset, validity_bits, input_bytes);
+    }
 }
 
-// Kernel entry point for counting valid bits in an Arrow validity bitmap.
+// Count valid rows directly from the device bitmap so Arrow export can provide
+// an exact null_count without copying validity to the CPU.
+//
+//   bytes -> mask padding -> popcount -> block sum -> global count
 extern "C" __global__ void arrow_validity_count_valid(const uint8_t *const input,
                                                       uint64_t *const output,
                                                       uint64_t len,
                                                       uint64_t arrow_offset) {
-    arrow_validity_count_valid_device(input, output, len, arrow_offset);
+    __shared__ uint64_t warp_counts[WARP_SIZE];
+
+    const uint64_t validity_bits = len + arrow_offset;
+    const uint64_t input_bytes = (validity_bits + 7) / 8;
+    const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
+
+    // Grid-stride over bitmap bytes. Each byte contributes the popcount of only
+    // row bits; leading Arrow offset bits and trailing padding bits are masked out.
+    uint64_t valid_count = 0;
+    for (uint64_t byte_idx = worker; byte_idx < input_bytes; byte_idx += stride) {
+        const uint32_t mask = arrow_validity_byte_mask(byte_idx, arrow_offset, validity_bits);
+        valid_count += __popc(static_cast<uint32_t>(input[byte_idx]) & mask);
+    }
+
+    // Reduce within the block first so global contention is one atomic add per block.
+    valid_count = block_sum_to_thread_zero(valid_count, warp_counts);
+    if (threadIdx.x == 0) {
+        atomicAdd(reinterpret_cast<unsigned long long *>(output),
+                  static_cast<unsigned long long>(valid_count));
+    }
 }
diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs
@@ -778,6 +778,8 @@ pub(super) async fn export_arrow_validity_buffer(
     let validity = execute_validity_cuda(validity, len, ctx).await?;
     match validity {
         Validity::NonNullable | Validity::AllValid => Ok((None, 0)),
+        // For non-Null Arrow layouts, callers still export the normal value buffers.
+        // This only marks every row null via buffer 0, the validity bitmap.
         Validity::AllInvalid => Ok((
             Some(device_zeroed_byte_buffer(
                 validity_bitmap_byte_len(len, arrow_offset)?,
@@ -832,7 +834,7 @@ fn device_zeroed_byte_buffer(
     Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(buffer))).slice(0..byte_len))
 }
 
-pub(super) fn count_arrow_validity_nulls(
+pub fn count_arrow_validity_nulls(
     bitmap: &BufferHandle,
     len: usize,
     arrow_offset: usize,
@@ -894,7 +896,7 @@ pub(super) fn count_arrow_validity_nulls(
 /// plus an array offset, so sliced compact exports need a GPU rewrite when either side has a
 /// bit-level offset. The kernel writes the output one 64-bit word at a time, funnel-shifting two
 /// adjacent input words, so the allocation is padded to whole words (zeroed by the edge masks).
-pub(super) fn repack_arrow_validity_buffer(
+pub fn repack_arrow_validity_buffer(
     input_buffer: &BufferHandle,
     input_offset: usize,
     len: usize,
@@ -3161,7 +3163,7 @@ mod tests {
     // Non-canonical row validity should export as a device-resident bitmap.
     #[crate::test]
     async fn test_export_struct_non_canonical_validity() -> VortexResult<()> {
-        let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+        let mut ctx = CudaSession::create_execution_ctx(&crate::cuda_session())
             .vortex_expect("failed to create execution context");
 
         let validity = DictArray::try_new(
diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs
@@ -66,33 +66,8 @@ pub use arrow_c_abi::ArrowDeviceType;
 #[cfg(feature = "_test-harness")]
 #[doc(hidden)]
 pub mod test_harness {
-    use vortex::array::buffer::BufferHandle;
-    use vortex::error::VortexResult;
-
-    use crate::CudaExecutionCtx;
-    use crate::arrow::canonical::count_arrow_validity_nulls as count_arrow_validity_nulls_impl;
-    use crate::arrow::canonical::repack_arrow_validity_buffer as repack_arrow_validity_buffer_impl;
-
-    /// Count null bits in an Arrow validity bitmap.
-    pub fn count_arrow_validity_nulls(
-        bitmap: &BufferHandle,
-        len: usize,
-        arrow_offset: usize,
-        ctx: &mut CudaExecutionCtx,
-    ) -> VortexResult<i64> {
-        count_arrow_validity_nulls_impl(bitmap, len, arrow_offset, ctx)
-    }
-
-    /// Repack a validity bitmap into Arrow's byte-addressed bitmap layout on the active stream.
-    pub fn repack_arrow_validity_buffer(
-        input_buffer: &BufferHandle,
-        input_offset: usize,
-        len: usize,
-        arrow_offset: usize,
-        ctx: &mut CudaExecutionCtx,
-    ) -> VortexResult<BufferHandle> {
-        repack_arrow_validity_buffer_impl(input_buffer, input_offset, len, arrow_offset, ctx)
-    }
+    pub use crate::arrow::canonical::count_arrow_validity_nulls;
+    pub use crate::arrow::canonical::repack_arrow_validity_buffer;
 }
 
 /// CUDA device memory.