diff --git a/Cargo.lock b/Cargo.lock index 0f7a33f3d14..cc95b511b7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10334,6 +10334,7 @@ dependencies = [ "vortex-cuda", "vortex-cuda-macros", "vortex-error", + "vortex-fsst", "vortex-nvcomp", ] diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index a58c1c379ce..6b6764c59cf 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -50,6 +50,7 @@ rstest = { workspace = true } tokio = { workspace = true, features = ["rt", "macros"] } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-cuda = { path = ".", features = ["_test-harness"] } +vortex-fsst = { workspace = true, features = ["_test-harness"] } [build-dependencies] bindgen = { workspace = true } @@ -94,3 +95,7 @@ harness = false [[bench]] name = "throughput_cuda" harness = false + +[[bench]] +name = "fsst_cuda" +harness = false diff --git a/vortex-cuda/benches/fsst_cuda.rs b/vortex-cuda/benches/fsst_cuda.rs new file mode 100644 index 00000000000..96e70e02e02 --- /dev/null +++ b/vortex-cuda/benches/fsst_cuda.rs @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! CUDA benchmarks for FSST decompression. 
+ +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +#[allow(dead_code)] +mod bench_config; +mod timed_launch_strategy; + +use std::sync::Arc; +use std::sync::atomic::Ordering; +use std::time::Duration; + +use criterion::BenchmarkId; +use criterion::Criterion; +use criterion::Throughput; +use futures::executor::block_on; +use vortex::array::IntoArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::match_each_integer_ptype; +use vortex::encodings::fsst::FSSTArrayExt; +use vortex::error::VortexExpect; +use vortex::session::VortexSession; +use vortex_cuda::CudaSession; +use vortex_cuda::executor::CudaArrayExt; +use vortex_cuda_macros::cuda_available; +use vortex_cuda_macros::cuda_not_available; +use vortex_fsst::test_utils::make_fsst_clickbench_urls; + +use crate::timed_launch_strategy::TimedLaunchStrategy; + +const BENCH_SIZES: &[(usize, &str)] = &[(1_000_000, "1M"), (5_000_000, "5M"), (10_000_000, "10M")]; + +fn benchmark_fsst_cuda_decompress(c: &mut Criterion) { + let mut group = c.benchmark_group("cuda"); + + for &(n, len_str) in BENCH_SIZES { + let mut setup_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context"); + let fsst = make_fsst_clickbench_urls(n, setup_ctx.execution_ctx()); + + let lens = fsst + .uncompressed_lengths() + .clone() + .execute::(setup_ctx.execution_ctx()) + .vortex_expect("canonicalize uncompressed_lengths"); + let total_size: usize = match_each_integer_ptype!(lens.ptype(), |P| { + lens.as_slice::
<P>
().iter().map(|x| *x as usize).sum() + }); + let uncompressed_size = total_size as u64; + + let fsst_array = fsst.into_array(); + + group.throughput(Throughput::Bytes(uncompressed_size)); + group.bench_with_input( + BenchmarkId::new("cuda/fsst/decompress", len_str), + &fsst_array, + |b, fsst_array| { + b.iter_custom(|iters| { + let timed = TimedLaunchStrategy::default(); + let timer = timed.timer(); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); + + for _ in 0..iters { + block_on(fsst_array.clone().execute_cuda(&mut cuda_ctx)).unwrap(); + } + Duration::from_nanos(timer.load(Ordering::Relaxed)) + }); + }, + ); + } + + group.finish(); +} + +criterion::criterion_group! { + name = benches; + config = bench_config::cuda_bench_config(); + targets = benchmark_fsst_cuda_decompress +} + +#[cuda_available] +criterion::criterion_main!(benches); + +#[cuda_not_available] +fn main() {} diff --git a/vortex-cuda/kernels/src/fsst.cu b/vortex-cuda/kernels/src/fsst.cu new file mode 100644 index 00000000000..938f262c3da --- /dev/null +++ b/vortex-cuda/kernels/src/fsst.cu @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#include "config.cuh" +#include +#include +#include + +// FSST decompression. A thread decodes one string at a time. +// +// Per-thread `Scratch` holds 24 bytes across three u64 lanes (`low`, `mid`, +// `high`) plus a `cursor` byte counter. Byte i lives at bit (8 * (i mod 8)) +// of: +// low for i in 0..8 +// mid for i in 8..16 +// high for i in 16..24 +// +// lsb msb +// low: [ b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 ] +// mid: [ b8 | b9 |b10 |b11 |b12 |b13 |b14 |b15 ] +// high: [b16 |b17 |b18 |b19 |b20 |b21 |b22 |b23 ] +// +// `Scratch::drain` picks the largest aligned store the gates allow +// (alignment of out_pos, cursor, remaining out_end room). 
Bytes leave from +// the low end (`low` byte 0); the kept bytes slide N positions toward that +// low end across all three lanes i.e. each u64 right-shifts by N*8 and +// pulls the next lane's low bits up to fill the vacated high bits. +// `Scratch::push` inserts a length-`len` masked symbol at byte offset +// `cursor`, spanning at most two of the three lanes. +// +// width gate ptx +// ------ ------------------------------------------ ---------------- +// 16 B out_pos % 16 == 0, cursor ≥ 16, room ≥ 16 st.global.v2.u64 +// 8 B out_pos % 8 == 0, cursor ≥ 8, room ≥ 8 st.global.u64 +// 4 B out_pos % 4 == 0, cursor ≥ 4, room ≥ 4 st.global.u32 +// 2 B out_pos % 2 == 0, cursor ≥ 2, room ≥ 2 st.global.u16 +// 1 B (always) st.global.u8 +// +// The narrow widths cover the prologue alignment-up (out_pos not yet +// 16-aligned) and the epilogue tail (< 16 bytes left, no room for u128). +// In steady state out_pos stays 16-aligned and u128 fires repeatedly. +// +// The 256-entry symbol table (≤ 2 KB) is read directly from global memory. +// Staging it into shared memory measured ~3% slower at 10M rows and ~15% +// slower at 1M rows (benchmarked on clickbench URLs). The hypothesis is that L1 +// already holds the table after a few iterations and the explicit shared copy +// adds bank-conflict latency on the warp-divergent `symbols[code]` reads; the +// gap is wider at 1M because the kernel is less bandwidth-bound there, so +// per-load latency shows up more. +// +// Decoded symbols are masked to their valid byte length so the table's high +// bits never leak. The main loop drains to `scratch.cursor ≤ 16`, keeping +// the next add (≤ 8 bytes) within the 24-byte capacity. +// +// `codes_offsets` is templated over the four unsigned integer widths +// (u8/u16/u32/u64). `output_offsets` is uint64_t. + +// 24-byte scratch buffer split across three u64 lanes. `cursor` is the +// number of bytes currently buffered and the next-push offset. 
+struct Scratch { + uint64_t low = 0; + uint64_t mid = 0; + uint64_t high = 0; + uint32_t cursor = 0; + + // Insert a length-`len` masked symbol at byte offset `cursor`. The + // symbol spans at most two of the three lanes. Caller must ensure + // cursor + len ≤ 24. + __device__ inline void push(uint64_t sym, uint32_t len) { + if (cursor < 8) { + low |= sym << (8u * cursor); + if (cursor + len > 8) { + mid |= sym >> (8u * (8u - cursor)); + } + } else { + mid |= sym << (8u * (cursor - 8u)); + if (cursor + len > 16) { + high |= sym >> (8u * (16u - cursor)); + } + } + cursor += len; + } + + // Emit one variable-width aligned store from the low end and slide the + // kept bytes toward the low end across all three lanes. + __device__ inline void drain(uint8_t *__restrict out, uint64_t &out_pos, uint64_t out_end) { + if (cursor >= 16 && (out_pos & 15u) == 0 && out_pos + 16 <= out_end) { + *reinterpret_cast(out + out_pos) = make_ulonglong2(low, mid); + low = high; + mid = 0; + high = 0; + out_pos += 16; + cursor -= 16; + } else if (cursor >= 8 && (out_pos & 7u) == 0 && out_pos + 8 <= out_end) { + *reinterpret_cast(out + out_pos) = low; + low = mid; + mid = high; + high = 0; + out_pos += 8; + cursor -= 8; + } else if (cursor >= 4 && (out_pos & 3u) == 0 && out_pos + 4 <= out_end) { + *reinterpret_cast(out + out_pos) = (uint32_t)low; + low = (low >> 32) | (mid << 32); + mid = (mid >> 32) | (high << 32); + high >>= 32; + out_pos += 4; + cursor -= 4; + } else if (cursor >= 2 && (out_pos & 1u) == 0 && out_pos + 2 <= out_end) { + *reinterpret_cast(out + out_pos) = (uint16_t)low; + low = (low >> 16) | (mid << 48); + mid = (mid >> 16) | (high << 48); + high >>= 16; + out_pos += 2; + cursor -= 2; + } else { + out[out_pos] = (uint8_t)low; + low = (low >> 8) | (mid << 56); + mid = (mid >> 8) | (high << 56); + high >>= 8; + out_pos += 1; + cursor -= 1; + } + } +}; + +template +struct FSSTArgs { + // Compressed FSST code stream, contiguous across all strings. 
String + // `sid`'s codes live in `[codes_offsets[sid], codes_offsets[sid + 1])`. + const uint8_t *__restrict codes_bytes; + // Per-string offsets into `codes_bytes`, length `num_strings + 1`. + const OffT *__restrict codes_offsets; + // FSST symbol table. + const uint64_t *__restrict symbols; + // Length in bytes (1..=8) of each entry in `symbols`. The remaining bits + // are unspecified. + const uint8_t *__restrict symbol_lengths; + // Buffer to write decoded data into. + uint8_t *__restrict output_bytes; + // Per-string offsets into `output_bytes`, length `num_strings + 1`. + const uint64_t *__restrict output_offsets; + // Validity of each string. + const uint8_t *__restrict validity_bits; +}; + +template +__device__ inline void fsst_decode_string(const FSSTArgs &args, uint64_t sid) { + if (((args.validity_bits[sid >> 3] >> (sid & 7u)) & 1u) == 0u) { + return; + } + + OffT in_pos = args.codes_offsets[sid]; + const OffT in_end = args.codes_offsets[sid + 1]; + uint64_t out_pos = args.output_offsets[sid]; + const uint64_t out_end = args.output_offsets[sid + 1]; + + Scratch scratch; + + while (in_pos < in_end) { + // Drain to scratch.cursor ≤ 16 so the next ≤8-byte symbol fits in 24. + while (scratch.cursor > 16) { + scratch.drain(args.output_bytes, out_pos, out_end); + } + + // Decode next code. 255 is the escape for raw literal bytes. + const uint8_t code = args.codes_bytes[in_pos]; + uint64_t sym; + uint32_t len, consumed; + if (code == 255) { + sym = (uint64_t)args.codes_bytes[in_pos + 1]; + len = 1; + consumed = 2; + } else { + sym = args.symbols[code]; + len = args.symbol_lengths[code]; + consumed = 1; + } + + // Zero out the symbol's high bytes beyond its valid length. + const uint64_t mask = (len == 8) ? ~0ULL : ((1ULL << (8u * len)) - 1ULL); + sym &= mask; + + scratch.push(sym, len); + in_pos += (OffT)consumed; + } + + // Epilogue: drain everything that's left. 
+ while (scratch.cursor > 0) { + scratch.drain(args.output_bytes, out_pos, out_end); + } +} + +#define GENERATE_FSST_KERNEL(suffix, OffT) \ + extern "C" __global__ void fsst_##suffix(const uint8_t *__restrict codes_bytes, \ + const OffT *__restrict codes_offsets, \ + const uint64_t *__restrict symbols, \ + const uint8_t *__restrict symbol_lengths, \ + const uint64_t *__restrict output_offsets, \ + const uint8_t *__restrict validity_bits, \ + uint8_t *__restrict output_bytes, \ + uint64_t num_strings) { \ + const FSSTArgs args = { \ + codes_bytes, \ + codes_offsets, \ + symbols, \ + symbol_lengths, \ + output_bytes, \ + output_offsets, \ + validity_bits, \ + }; \ + \ + const uint64_t elements_per_block = (uint64_t)blockDim.x * ELEMENTS_PER_THREAD; \ + const uint64_t block_start = (uint64_t)blockIdx.x * elements_per_block; \ + const uint64_t block_end = (block_start + elements_per_block < num_strings) \ + ? (block_start + elements_per_block) \ + : num_strings; \ + \ + for (uint64_t sid = block_start + threadIdx.x; sid < block_end; sid += blockDim.x) { \ + fsst_decode_string(args, sid); \ + } \ + } + +GENERATE_FSST_KERNEL(u8, uint8_t) +GENERATE_FSST_KERNEL(u16, uint16_t) +GENERATE_FSST_KERNEL(u32, uint32_t) +GENERATE_FSST_KERNEL(u64, uint64_t) diff --git a/vortex-cuda/src/kernel/encodings/fsst.rs b/vortex-cuda/src/kernel/encodings/fsst.rs new file mode 100644 index 00000000000..5d3d66eaf04 --- /dev/null +++ b/vortex-cuda/src/kernel/encodings/fsst.rs @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! CUDA executor for FSST decompression. 
+ +use std::fmt::Debug; +use std::sync::Arc; + +use async_trait::async_trait; +use cudarc::driver::DevicePtr; +use cudarc::driver::DeviceRepr; +use cudarc::driver::PushKernelArg; +use tracing::instrument; +use vortex::array::ArrayRef; +use vortex::array::Canonical; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::VarBinViewArray; +use vortex::array::arrays::primitive::PrimitiveDataParts; +use vortex::array::arrays::varbin::VarBinArrayExt; +use vortex::array::arrays::varbinview::BinaryView; +use vortex::array::arrays::varbinview::build_views::MAX_BUFFER_LEN; +use vortex::array::arrays::varbinview::build_views::build_views; +use vortex::array::buffer::DeviceBuffer; +use vortex::array::match_each_integer_ptype; +use vortex::array::match_each_unsigned_integer_ptype; +use vortex::array::validity::Validity; +use vortex::buffer::Alignment; +use vortex::buffer::Buffer; +use vortex::dtype::NativePType; +use vortex::encodings::fsst::FSST; +use vortex::encodings::fsst::FSSTArray; +use vortex::encodings::fsst::FSSTArrayExt; +use vortex::error::VortexExpect; +use vortex::error::VortexResult; +use vortex::error::vortex_err; + +use crate::CudaBufferExt; +use crate::CudaDeviceBuffer; +use crate::executor::CudaExecute; +use crate::executor::CudaExecutionCtx; + +/// CUDA decoder for FSST. 
+#[derive(Debug)] +pub(crate) struct FSSTExecutor; + +impl FSSTExecutor { + fn try_specialize(array: ArrayRef) -> Option { + array.try_downcast::().ok() + } +} + +#[async_trait] +impl CudaExecute for FSSTExecutor { + #[instrument(level = "trace", skip_all, fields(executor = ?self))] + async fn execute( + &self, + array: ArrayRef, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult { + let fsst = Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected FSSTArray"))?; + + let dtype = fsst.dtype().clone(); + let validity = fsst.codes().validity()?; + + if fsst.is_empty() || matches!(validity, Validity::AllInvalid) { + let empty = unsafe { + VarBinViewArray::new_unchecked( + Buffer::::zeroed(fsst.len()), + Arc::from([]), + dtype, + validity, + ) + }; + return Ok(Canonical::VarBinView(empty)); + } + + let lens = fsst + .uncompressed_lengths() + .clone() + .execute::(ctx.execution_ctx())?; + let codes_offsets = fsst + .codes() + .offsets() + .clone() + .execute::(ctx.execution_ctx())?; + + // Prefix-sum lens to per-string u64 output offsets so the kernel + // knows where to write each decoded string. + let output_offsets: Vec = match_each_integer_ptype!(lens.ptype(), |P| { + let mut out = Vec::with_capacity(lens.len() + 1); + let mut acc: u64 = 0; + out.push(0u64); + #[allow(clippy::unnecessary_cast)] + for &l in lens.as_slice::
<P>
() { + acc += l as u64; + out.push(acc); + } + out + }); + + // Dispatch on the unsigned width; signed and unsigned offsets of the + // same width share an identical byte representation. + match_each_unsigned_integer_ptype!(codes_offsets.ptype().to_unsigned(), |U| { + decode_fsst::(fsst, codes_offsets, lens, output_offsets, ctx).await + }) + } +} + +async fn decode_fsst( + fsst: FSSTArray, + codes_offsets: PrimitiveArray, + lens: PrimitiveArray, + output_offsets: Vec, + ctx: &mut CudaExecutionCtx, +) -> VortexResult +where + U: NativePType + DeviceRepr + Send + Sync + 'static, +{ + let dtype = fsst.dtype().clone(); + let validity = fsst.codes().validity()?; + let num_strings = fsst.len(); + let num_strings_u64 = num_strings as u64; + let total_size = usize::try_from( + *output_offsets + .last() + .vortex_expect("output_offsets has at least one entry"), + ) + .vortex_expect("total_size fits in usize"); + + let symbols_u64: Vec = fsst.symbols().iter().map(|s| s.to_u64()).collect(); + let symbol_lengths = fsst.symbol_lengths().clone(); + let codes_bytes_handle = fsst.codes_bytes_handle().clone(); + let PrimitiveDataParts { + buffer: codes_offsets_buffer, + .. + } = codes_offsets.into_data_parts(); + + let (.., validity_bits) = validity + .clone() + .execute_mask(num_strings, ctx.execution_ctx())? + .into_bit_buffer() + .sliced() + .into_inner(); + + let (symbols, symbol_lengths, output_offsets, validity_device, codes_bytes, codes_offsets) = futures::try_join!( + ctx.copy_to_device(symbols_u64)?, + ctx.copy_to_device(symbol_lengths)?, + ctx.copy_to_device(output_offsets)?, + ctx.copy_to_device(validity_bits.to_vec())?, + ctx.ensure_on_device(codes_bytes_handle), + ctx.ensure_on_device(codes_offsets_buffer), + )?; + + // The kernel checks store alignment relative to the base via + // `out_pos % N`, so the base must satisfy the widest store (u128 → 16). 
+ let device_output = ctx.device_alloc::(total_size)?; + let (output_base_ptr, _) = device_output.device_ptr(ctx.stream()); + assert_eq!( + output_base_ptr % 16, + 0, + "device_output base not 16-aligned: {output_base_ptr:#x}", + ); + + let codes_bytes_view = codes_bytes.cuda_view::()?; + let codes_offsets_view = codes_offsets.cuda_view::()?; + let symbols_view = symbols.cuda_view::()?; + let symbol_lengths_view = symbol_lengths.cuda_view::()?; + let output_offsets_view = output_offsets.cuda_view::()?; + let validity_view = validity_device.cuda_view::()?; + + let cuda_function = ctx.load_function("fsst", &[U::PTYPE])?; + ctx.launch_kernel(&cuda_function, num_strings, |args| { + args.arg(&codes_bytes_view) + .arg(&codes_offsets_view) + .arg(&symbols_view) + .arg(&symbol_lengths_view) + .arg(&output_offsets_view) + .arg(&validity_view) + .arg(&device_output) + .arg(&num_strings_u64); + })?; + + let host_bytes = CudaDeviceBuffer::new(device_output) + .copy_to_host(Alignment::new(1))? + .await?; + let host_bytes = host_bytes.slice(0..total_size); + + let (buffers, views) = match_each_integer_ptype!(lens.ptype(), |P| { + build_views( + 0, + MAX_BUFFER_LEN, + host_bytes.into_mut(), + lens.as_slice::
<P>
(), + ) + }); + + Ok(Canonical::VarBinView(unsafe { + VarBinViewArray::new_unchecked(views, Arc::from(buffers), dtype, validity) + })) +} + +#[cfg(test)] +mod tests { + use rstest::rstest; + use vortex::array::IntoArray; + use vortex::array::arrays::VarBinArray; + use vortex::array::assert_arrays_eq; + use vortex::dtype::DType; + use vortex::dtype::Nullability; + use vortex::encodings::fsst::fsst_compress; + use vortex::encodings::fsst::fsst_train_compressor; + use vortex::error::VortexExpect; + use vortex::session::VortexSession; + + use super::*; + use crate::CanonicalCudaExt; + use crate::session::CudaSession; + + #[rstest] + #[case::non_null( + vec![Some(&b"the quick brown fox"[..]), + Some(&b"jumps over the lazy dog"[..]), + Some(&b"hello world"[..]), + Some(&b"vortex fsst test string"[..])], + Nullability::NonNullable, + )] + #[case::partial_nulls( + vec![Some(&b"alpha"[..]), None, Some(&b"gamma"[..]), None, Some(&b"epsilon"[..])], + Nullability::Nullable, + )] + #[case::all_nulls( + vec![None, None, None, None, None], + Nullability::Nullable, + )] + #[crate::test] + async fn test_cuda_fsst_decompression_roundtrip( + #[case] strings: Vec>, + #[case] nullability: Nullability, + ) -> VortexResult<()> { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context"); + + let varbin = VarBinArray::from_iter(strings, DType::Binary(nullability)); + let compressor = fsst_train_compressor(&varbin); + let dtype = varbin.dtype().clone(); + let len = varbin.len(); + let fsst_array = + fsst_compress(&varbin, len, &dtype, &compressor, cuda_ctx.execution_ctx()).into_array(); + + let cpu_result = crate::canonicalize_cpu(fsst_array.clone())?; + let gpu_result = FSSTExecutor + .execute(fsst_array, &mut cuda_ctx) + .await + .vortex_expect("GPU decompression failed") + .into_host() + .await? 
+ .into_array(); + + assert_arrays_eq!(cpu_result.into_array(), gpu_result); + Ok(()) + } + + /// Exercises the multi-block grid-stride path on a larger dataset. + #[crate::test] + async fn test_cuda_fsst_decompression_roundtrip_large() -> VortexResult<()> { + use vortex_fsst::test_utils::make_fsst_clickbench_urls; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context"); + + let fsst_array = make_fsst_clickbench_urls(100_000, cuda_ctx.execution_ctx()).into_array(); + + let cpu_result = crate::canonicalize_cpu(fsst_array.clone())?; + let gpu_result = FSSTExecutor + .execute(fsst_array, &mut cuda_ctx) + .await + .vortex_expect("GPU decompression failed") + .into_host() + .await? + .into_array(); + + assert_arrays_eq!(cpu_result.into_array(), gpu_result); + Ok(()) + } +} diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs index 62a8d9f606d..1571762fc7b 100644 --- a/vortex-cuda/src/kernel/encodings/mod.rs +++ b/vortex-cuda/src/kernel/encodings/mod.rs @@ -6,6 +6,7 @@ mod bitpacked; mod date_time_parts; mod decimal_byte_parts; mod for_; +mod fsst; mod runend; mod sequence; mod zigzag; @@ -18,6 +19,7 @@ pub(crate) use bitpacked::BitPackedExecutor; pub(crate) use date_time_parts::DateTimePartsExecutor; pub(crate) use decimal_byte_parts::DecimalBytePartsExecutor; pub(crate) use for_::FoRExecutor; +pub(crate) use fsst::FSSTExecutor; pub(crate) use runend::RunEndExecutor; pub(crate) use sequence::SequenceExecutor; pub(crate) use zigzag::ZigZagExecutor; diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index bbebef9430d..5692bb6affc 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -37,6 +37,7 @@ use kernel::DateTimePartsExecutor; use kernel::DecimalBytePartsExecutor; pub use kernel::DefaultLaunchStrategy; use kernel::DictExecutor; +use kernel::FSSTExecutor; use kernel::FilterExecutor; use kernel::FoRExecutor; pub use 
kernel::LaunchStrategy; @@ -70,6 +71,7 @@ use vortex::encodings::datetime_parts::DateTimeParts; use vortex::encodings::decimal_byte_parts::DecimalByteParts; use vortex::encodings::fastlanes::BitPacked; use vortex::encodings::fastlanes::FoR; +use vortex::encodings::fsst::FSST; use vortex::encodings::runend::RunEnd; use vortex::encodings::sequence::Sequence; use vortex::encodings::zigzag::ZigZag; @@ -114,6 +116,7 @@ pub fn initialize_cuda(session: &CudaSession) { session.register_kernel(Dict.id(), &DictExecutor); session.register_kernel(Shared.id(), &SharedExecutor); session.register_kernel(FoR.id(), &FoRExecutor); + session.register_kernel(FSST.id(), &FSSTExecutor); session.register_kernel(RunEnd.id(), &RunEndExecutor); session.register_kernel(Sequence.id(), &SequenceExecutor); session.register_kernel(ZigZag.id(), &ZigZagExecutor);