// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#include "config.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>

// FSST decompression. A thread decodes one string at a time.
//
// Per-thread scratch holds 24 bytes across three u64 registers. Byte i
// lives at bit (8 * (i mod 8)) of:
//   scratch_low  for i in 0..8
//   scratch_mid  for i in 8..16
//   scratch_high for i in 16..24
//
//                 lsb                                       msb
//   scratch_low:  [ b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 ]
//   scratch_mid:  [ b8 | b9 |b10 |b11 |b12 |b13 |b14 |b15 ]
//   scratch_high: [b16 |b17 |b18 |b19 |b20 |b21 |b22 |b23 ]
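//   (e.g. byte 13 sits in scratch_mid at bits 40..47, since 13 mod 8 = 5)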
//
// `drain_step` picks the largest aligned store the gates allow (alignment of
// out_pos, scratch occupancy, remaining out_end room). Bytes leave from the
// low end (scratch_low byte 0); the kept bytes slide N positions toward that
// low end across all three registers, i.e. each u64 right-shifts by N*8 and
// pulls the next register's low bits up to fill the vacated high bits.
//
//   width   gate                                          ptx
//   ------  --------------------------------------------  ----------------
//   16 B    out_pos % 16 == 0, scratch ≥ 16, room ≥ 16    st.global.v2.u64
//   8 B     out_pos % 8 == 0,  scratch ≥ 8,  room ≥ 8     st.global.u64
//   4 B     out_pos % 4 == 0,  scratch ≥ 4,  room ≥ 4     st.global.u32
//   2 B     out_pos % 2 == 0,  scratch ≥ 2,  room ≥ 2     st.global.u16
//   1 B     (always)                                      st.global.u8
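//
// Worked example of the gate cascade (draining 21 leftover bytes at
// out_pos = 37 with ample room): u8 brings out_pos to 38, u16 to 40,
// u64 to 48, u64 to 56, u16 to 58; the 16 B store never fires because
// occupancy falls below 16 before out_pos reaches a 16-byte boundary.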
//
// The narrow widths cover the prologue alignment-up (out_pos not yet
// 16-aligned) and the epilogue tail (< 16 bytes left, no room for u128).
// In steady state out_pos stays 16-aligned and u128 fires repeatedly.
//
// The 256-entry symbol table (≤ 2 KB) is read directly from global memory.
// Staging it into shared memory measured ~3% slower at 10M rows and ~15%
// slower at 1M rows (benchmarked on ClickBench URLs). The hypothesis is
// that L1 already holds the table after a few iterations and the explicit
// shared copy adds bank-conflict latency on the warp-divergent
// `symbols[code]` reads; the gap is wider at 1M because the kernel is less
// bandwidth-bound there, so per-load latency shows up more.
//
// Decoded symbols are masked to their valid byte length so the table's high
// bits never leak. The main loop drains to `scratch_bytes < 16`, keeping the
// next add (≤ 8 bytes) within the 24-byte capacity (worst case 15 + 8 = 23).
//
// `codes_offsets` is templated over the four unsigned integer widths
// (u8/u16/u32/u64). `output_offsets` is always uint32_t because of the
// MAX_BUFFER_LEN output limit.

template <typename OffT>
__device__ inline void fsst_decode_string(const uint8_t *__restrict codes_bytes,
                                          const OffT *__restrict codes_offsets,
                                          const uint64_t *__restrict symbols,
                                          const uint8_t *__restrict symbol_lengths,
                                          const uint32_t *__restrict output_offsets,
                                          uint8_t *__restrict output_bytes,
                                          uint64_t sid) {
  OffT in_pos = codes_offsets[sid];
  const OffT in_end = codes_offsets[sid + 1];
  uint32_t out_pos = output_offsets[sid];
  const uint32_t out_end = output_offsets[sid + 1];

  uint64_t scratch_low = 0, scratch_mid = 0, scratch_high = 0;
  uint32_t scratch_bytes = 0;

  // Emit one drain step: pick the largest aligned store the gates allow,
  // write it, and slide the kept bytes toward the low end (each register
  // right-shifts and pulls bits from the register one position above).
  auto drain_step = [&] {
    if (scratch_bytes >= 16 && (out_pos & 15u) == 0 && out_pos + 16 <= out_end) {
      *reinterpret_cast<ulonglong2 *>(output_bytes + out_pos) =
          make_ulonglong2(scratch_low, scratch_mid);
      scratch_low = scratch_high;
      scratch_mid = 0;
      scratch_high = 0;
      out_pos += 16;
      scratch_bytes -= 16;
    } else if (scratch_bytes >= 8 && (out_pos & 7u) == 0 && out_pos + 8 <= out_end) {
      *reinterpret_cast<uint64_t *>(output_bytes + out_pos) = scratch_low;
      scratch_low = scratch_mid;
      scratch_mid = scratch_high;
      scratch_high = 0;
      out_pos += 8;
      scratch_bytes -= 8;
    } else if (scratch_bytes >= 4 && (out_pos & 3u) == 0 && out_pos + 4 <= out_end) {
      *reinterpret_cast<uint32_t *>(output_bytes + out_pos) = (uint32_t)scratch_low;
      scratch_low = (scratch_low >> 32) | (scratch_mid << 32);
      scratch_mid = (scratch_mid >> 32) | (scratch_high << 32);
      scratch_high >>= 32;
      out_pos += 4;
      scratch_bytes -= 4;
    } else if (scratch_bytes >= 2 && (out_pos & 1u) == 0 && out_pos + 2 <= out_end) {
      *reinterpret_cast<uint16_t *>(output_bytes + out_pos) = (uint16_t)scratch_low;
      scratch_low = (scratch_low >> 16) | (scratch_mid << 48);
      scratch_mid = (scratch_mid >> 16) | (scratch_high << 48);
      scratch_high >>= 16;
      out_pos += 2;
      scratch_bytes -= 2;
    } else {
      output_bytes[out_pos] = (uint8_t)scratch_low;
      scratch_low = (scratch_low >> 8) | (scratch_mid << 56);
      scratch_mid = (scratch_mid >> 8) | (scratch_high << 56);
      scratch_high >>= 8;
      out_pos += 1;
      scratch_bytes -= 1;
    }
  };

  while (in_pos < in_end) {
    // Drain to scratch_bytes < 16 so the next ≤ 8-byte symbol fits.
    while (scratch_bytes >= 16) {
      drain_step();
    }

    // Decode next code. 255 is the escape for raw literal bytes.
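    // For example, the pair [255, 0x41] emits the literal byte 'A' (0x41)
    // and consumes two input bytes.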
    const uint8_t code = codes_bytes[in_pos];
    uint64_t sym;
    uint32_t len, consumed;
    if (code == 255) {
      sym = (uint64_t)codes_bytes[in_pos + 1];
      len = 1;
      consumed = 2;
    } else {
      sym = symbols[code];
      len = symbol_lengths[code];
      consumed = 1;
    }

    // Zero out the symbol's high bytes beyond its valid length.
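    // len == 8 is special-cased because shifting a u64 by 64 bits is
    // undefined; e.g. len = 3 yields mask = 0x0000000000FFFFFF.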
    const uint64_t mask = (len == 8) ? ~0ULL : ((1ULL << (8u * len)) - 1ULL);
    sym &= mask;

    // Insert at byte offset scratch_bytes; the symbol can span at most
    // two of the three scratch segments.
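    // E.g. a 5-byte symbol landing at scratch_bytes = 6 puts its bytes 0..1
    // in scratch_low's top two lanes (sym << 48) and bytes 2..4 in
    // scratch_mid's low lanes (sym >> 16).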
    if (scratch_bytes < 8) {
      scratch_low |= sym << (8u * scratch_bytes);
      if (scratch_bytes + len > 8) {
        scratch_mid |= sym >> (8u * (8u - scratch_bytes));
      }
    } else {
      scratch_mid |= sym << (8u * (scratch_bytes - 8u));
      if (scratch_bytes + len > 16) {
        scratch_high |= sym >> (8u * (16u - scratch_bytes));
      }
    }
    scratch_bytes += len;
    in_pos += (OffT)consumed;
  }

  // Epilogue: drain everything that's left.
  while (scratch_bytes > 0) {
    drain_step();
  }
}

#define GENERATE_FSST_KERNEL(suffix, OffT)                                               \
  extern "C" __global__ void fsst_##suffix(const uint8_t *__restrict codes_bytes,        \
                                           const OffT *__restrict codes_offsets,         \
                                           const uint64_t *__restrict symbols,           \
                                           const uint8_t *__restrict symbol_lengths,     \
                                           const uint32_t *__restrict output_offsets,    \
                                           uint8_t *__restrict output_bytes,             \
                                           uint64_t num_strings) {                       \
    const uint64_t elements_per_block = (uint64_t)blockDim.x * ELEMENTS_PER_THREAD;      \
    const uint64_t block_start = (uint64_t)blockIdx.x * elements_per_block;              \
    const uint64_t block_end = (block_start + elements_per_block < num_strings)          \
                                   ? (block_start + elements_per_block)                  \
                                   : num_strings;                                        \
                                                                                         \
    for (uint64_t sid = block_start + threadIdx.x; sid < block_end; sid += blockDim.x) { \
      fsst_decode_string<OffT>(codes_bytes,                                              \
                               codes_offsets,                                            \
                               symbols,                                                  \
                               symbol_lengths,                                           \
                               output_offsets,                                           \
                               output_bytes,                                             \
                               sid);                                                     \
    }                                                                                    \
  }

GENERATE_FSST_KERNEL(u8, uint8_t)
GENERATE_FSST_KERNEL(u16, uint16_t)
GENERATE_FSST_KERNEL(u32, uint32_t)
GENERATE_FSST_KERNEL(u64, uint64_t)
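
// Illustrative host-side launch, a sketch rather than part of this file: size
// the grid so each block covers blockDim.x * ELEMENTS_PER_THREAD strings, the
// same tiling the kernels assume. The 128-thread block size and the helper's
// name are assumptions for the example; the real launch configuration lives
// with the caller. All pointers are device pointers.
//
//   void launch_fsst_u32(const uint8_t *codes_bytes, const uint32_t *codes_offsets,
//                        const uint64_t *symbols, const uint8_t *symbol_lengths,
//                        const uint32_t *output_offsets, uint8_t *output_bytes,
//                        uint64_t num_strings, cudaStream_t stream) {
//     const uint32_t block = 128;  // assumed; any multiple of the warp size works
//     const uint64_t per_block = (uint64_t)block * ELEMENTS_PER_THREAD;
//     const uint32_t grid = (uint32_t)((num_strings + per_block - 1) / per_block);
//     fsst_u32<<<grid, block, 0, stream>>>(codes_bytes, codes_offsets, symbols,
//                                          symbol_lengths, output_offsets,
//                                          output_bytes, num_strings);
//   }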