Skip to content

Commit 1553d03

Browse files
committed
Add KvikIO GDS (GPUDirect Storage) path for GPU reads
When kvikio is installed, read_geotiff_gpu() can read compressed tile bytes directly from NVMe SSD to GPU VRAM via GPUDirect Storage, bypassing CPU memory entirely: Normal: SSD -> CPU (mmap) -> cupy.asarray (CPU->GPU copy) With GDS: SSD -> GPU VRAM (direct DMA, no CPU involved) The full pipeline for a ZSTD COG with GDS + nvCOMP: SSD --(GDS)--> GPU compressed tiles --(nvCOMP)--> GPU decompressed --> GPU predictor decode --> GPU tile assembly --> CuPy DataArray Fallback chain in read_geotiff_gpu: 1. KvikIO GDS file read + nvCOMP batch decompress (fastest) 2. CPU mmap tile extract + nvCOMP batch decompress 3. CPU mmap tile extract + Numba CUDA kernels 4. CPU read_to_array + cupy.asarray transfer (slowest) Also adds: - gpu_decode_tiles_from_file(): accepts file path + offsets instead of pre-extracted bytes, enabling the GDS path - _try_nvcomp_from_device_bufs(): nvCOMP on tiles already in GPU memory (from GDS), avoiding a device-to-host round-trip - _apply_predictor_and_assemble(): shared GPU post-processing used by both GDS and mmap paths KvikIO is optional: conda install -c rapidsai kvikio GDS requires: NVMe SSD + NVIDIA kernel module (nvidia-fs)
1 parent 53c63e3 commit 1553d03

2 files changed

Lines changed: 234 additions & 13 deletions

File tree

xrspatial/geotiff/__init__.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -588,14 +588,8 @@ def read_geotiff_gpu(source: str, *,
588588
return xr.DataArray(arr_gpu, dims=['y', 'x'],
589589
coords=coords, name=name, attrs=attrs)
590590

591-
# Extract compressed tile bytes
592591
offsets = ifd.tile_offsets
593592
byte_counts = ifd.tile_byte_counts
594-
compressed_tiles = []
595-
for i in range(len(offsets)):
596-
compressed_tiles.append(
597-
bytes(data[offsets[i]:offsets[i] + byte_counts[i]]))
598-
599593
compression = ifd.compression
600594
predictor = ifd.predictor
601595
samples = ifd.samples_per_pixel
@@ -607,17 +601,42 @@ def read_geotiff_gpu(source: str, *,
607601
finally:
608602
src.close()
609603

610-
# GPU decode
604+
# GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path
605+
from ._gpu_decode import gpu_decode_tiles_from_file
606+
arr_gpu = None
607+
611608
try:
612-
arr_gpu = gpu_decode_tiles(
613-
compressed_tiles,
609+
arr_gpu = gpu_decode_tiles_from_file(
610+
source, offsets, byte_counts,
614611
tw, th, width, height,
615612
compression, predictor, dtype, samples,
616613
)
617-
except ValueError:
618-
# Unsupported compression -- fall back to CPU then transfer
619-
arr_cpu, _ = read_to_array(source, overview_level=overview_level)
620-
arr_gpu = cupy.asarray(arr_cpu)
614+
except Exception:
615+
pass
616+
617+
if arr_gpu is None:
618+
# Fallback: extract tiles via CPU mmap, then GPU decode
619+
src2 = _FileSource(source)
620+
data2 = src2.read_all()
621+
try:
622+
compressed_tiles = [
623+
bytes(data2[offsets[i]:offsets[i] + byte_counts[i]])
624+
for i in range(len(offsets))
625+
]
626+
finally:
627+
src2.close()
628+
629+
if arr_gpu is None:
630+
try:
631+
arr_gpu = gpu_decode_tiles(
632+
compressed_tiles,
633+
tw, th, width, height,
634+
compression, predictor, dtype, samples,
635+
)
636+
except (ValueError, Exception):
637+
# Unsupported compression -- fall back to CPU then transfer
638+
arr_cpu, _ = read_to_array(source, overview_level=overview_level)
639+
arr_gpu = cupy.asarray(arr_cpu)
621640

622641
# Build DataArray
623642
if name is None:

xrspatial/geotiff/_gpu_decode.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,39 @@ def _assemble_tiles_kernel(
672672
output[dst_byte + b] = decompressed_buf[src_byte + b]
673673

674674

675+
# ---------------------------------------------------------------------------
676+
# KvikIO GDS (GPUDirect Storage) -- read file directly to GPU
677+
# ---------------------------------------------------------------------------
678+
679+
def _try_kvikio_read_tiles(file_path, tile_offsets, tile_byte_counts, tile_bytes):
    """Read compressed tile bytes directly from SSD to GPU via GDS.

    When kvikio is available and GDS is supported, file data is DMA'd
    directly from the NVMe drive to GPU VRAM, bypassing CPU entirely.
    Falls back to None if kvikio is not installed or GDS is not available.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    tile_offsets, tile_byte_counts : sequence of int
        Byte offset and compressed byte length of each tile in the file.
    tile_bytes : int
        Decompressed size of one tile.  Currently unused; kept for
        signature compatibility with existing callers.

    Returns
    -------
    list of 1-D uint8 cupy arrays (one per tile) on GPU, or None.
    """
    try:
        import cupy
        import kvikio
    except ImportError:
        return None

    try:
        d_tiles = []
        futures = []
        with kvikio.CuFile(file_path, 'r') as f:
            for off, bc in zip(tile_offsets, tile_byte_counts):
                buf = cupy.empty(bc, dtype=cupy.uint8)
                # CuFile.pread is non-blocking and returns an IOFuture.
                # Collect the futures so we can wait on them while the
                # file is still open -- dropping them would let the
                # `with` block close the file before the DMA finishes.
                futures.append((f.pread(buf, file_offset=off), bc))
                d_tiles.append(buf)
            # Wait for every read to complete; IOFuture.get() returns
            # the number of bytes actually read.  A short read means a
            # truncated/corrupt file -- treat it as failure so the
            # caller falls back to the CPU path.
            for fut, bc in futures:
                if fut.get() != bc:
                    return None
        return d_tiles
    except Exception:
        # GDS not available (no NVMe, no kernel module, etc.)
        # Fall back to normal CPU read path
        return None
706+
707+
675708
# ---------------------------------------------------------------------------
676709
# nvCOMP batch decompression (optional, fast path)
677710
# ---------------------------------------------------------------------------
@@ -851,6 +884,175 @@ class _NvcompDeflateDecompOpts(ctypes.Structure):
851884
# High-level GPU decode pipeline
852885
# ---------------------------------------------------------------------------
853886

887+
def gpu_decode_tiles_from_file(
    file_path: str,
    tile_offsets: list | tuple,
    tile_byte_counts: list | tuple,
    tile_width: int,
    tile_height: int,
    image_width: int,
    image_height: int,
    compression: int,
    predictor: int,
    dtype: np.dtype,
    samples: int = 1,
):
    """Decode tiles from a file, using GDS if available.

    Tries KvikIO GDS (SSD -> GPU direct) first, then falls back to
    CPU mmap + gpu_decode_tiles.

    Parameters
    ----------
    file_path : str
        Path of the GeoTIFF on disk.
    tile_offsets, tile_byte_counts : list | tuple
        Byte offset and compressed length of each tile in the file.
    tile_width, tile_height : int
        Tile dimensions in pixels.
    image_width, image_height : int
        Full image dimensions in pixels.
    compression : int
        TIFF compression tag value (50000 enables the nvCOMP ZSTD path).
    predictor : int
        TIFF predictor tag value, forwarded to the GPU post-processing.
    dtype : np.dtype
        Sample data type.
    samples : int
        Samples per pixel.

    Returns
    -------
    Decoded image on GPU, or None to signal the caller to fall back to
    the bytes-based (CPU tile extraction) path when GDS is unavailable.
    """
    import cupy

    # Decompressed size of one full tile -- computed once and shared by
    # the GDS read, the nvCOMP path, and the assembly step.
    tile_bytes = tile_width * tile_height * dtype.itemsize * samples

    # Try GDS: read compressed tiles directly from SSD to GPU
    d_tiles = _try_kvikio_read_tiles(
        file_path, tile_offsets, tile_byte_counts, tile_bytes)

    if d_tiles is None:
        # No GDS -- signal caller to extract tile bytes via CPU mmap
        # and use the bytes-based gpu_decode_tiles() path instead.
        return None

    # Tiles are already on GPU as cupy arrays.
    # Try nvCOMP batch decompress on them directly.
    if compression in (50000,) and _get_nvcomp() is not None:
        # ZSTD: nvCOMP can decompress directly from GPU buffers
        result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression)
        if result is not None:
            # Tiles are packed contiguously, so per-tile offsets into the
            # decompressed buffer are simply i * tile_bytes.
            decomp_offsets = np.arange(len(d_tiles), dtype=np.int64) * tile_bytes
            d_decomp_offsets = cupy.asarray(decomp_offsets)
            # Apply predictor + assemble (shared GPU post-processing)
            return _apply_predictor_and_assemble(
                result, d_decomp_offsets, len(d_tiles),
                tile_width, tile_height, image_width, image_height,
                predictor, dtype, samples, tile_bytes)

    # GDS read succeeded but nvCOMP can't decompress on GPU,
    # or it's LZW/deflate. Copy tiles to host and use normal path.
    compressed_tiles = [t.get().tobytes() for t in d_tiles]
    return gpu_decode_tiles(
        compressed_tiles, tile_width, tile_height,
        image_width, image_height, compression, predictor, dtype, samples)
942+
943+
944+
def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression):
    """Run nvCOMP batch decompress on tiles already in GPU memory.

    Used by the GDS path: the compressed tiles arrived in VRAM without
    touching host memory, so decompressing them in place avoids a
    device-to-host round-trip.

    Parameters
    ----------
    d_tiles : list of 1-D uint8 cupy arrays
        Compressed tile data, one device buffer per tile.
    tile_bytes : int
        Decompressed size of one tile (every output buffer is this size).
    compression : int
        TIFF compression tag; only 50000 (ZSTD) is mapped to nvCOMP
        entry points here -- anything else returns None.

    Returns
    -------
    Single contiguous uint8 cupy array (all tiles concatenated in order),
    or None on any failure so the caller can fall back to the CPU path.
    """
    import ctypes
    import cupy

    lib = _get_nvcomp()
    if lib is None:
        return None

    # Options struct passed by value to the nvCOMP C API.
    # NOTE(review): field layout (c_int backend + 60 reserved bytes) is
    # assumed to match the installed nvCOMP version's header -- confirm
    # against the nvcomp.h in use.
    class _NvcompDecompOpts(ctypes.Structure):
        _fields_ = [('backend', ctypes.c_int), ('reserved', ctypes.c_char * 60)]

    try:
        n = len(d_tiles)
        # One output buffer per tile, each tile_bytes long.
        d_decomp_bufs = [cupy.empty(tile_bytes, dtype=cupy.uint8) for _ in range(n)]

        # nvCOMP's batched API takes *device* arrays of pointers/sizes,
        # so the pointer tables themselves live on the GPU.
        d_comp_ptrs = cupy.array([t.data.ptr for t in d_tiles], dtype=cupy.uint64)
        d_decomp_ptrs = cupy.array([b.data.ptr for b in d_decomp_bufs], dtype=cupy.uint64)
        d_comp_sizes = cupy.array([t.size for t in d_tiles], dtype=cupy.uint64)
        d_buf_sizes = cupy.full(n, tile_bytes, dtype=cupy.uint64)
        d_actual = cupy.empty(n, dtype=cupy.uint64)

        opts = _NvcompDecompOpts(backend=0, reserved=b'\x00' * 60)

        # Map compression tag -> nvCOMP entry points (ZSTD only for now).
        fn_name = {50000: 'nvcompBatchedZstdDecompressGetTempSizeAsync'}.get(compression)
        dec_name = {50000: 'nvcompBatchedZstdDecompressAsync'}.get(compression)
        if fn_name is None:
            return None

        # Query scratch-space requirement.  Non-zero status = failure.
        # NOTE(review): n and tile_bytes are passed as bare Python ints
        # (no argtypes declared), relying on ctypes' default int
        # conversion -- verify widths against the nvCOMP prototype.
        temp_fn = getattr(lib, fn_name)
        temp_fn.restype = ctypes.c_int
        temp_size = ctypes.c_size_t(0)
        s = temp_fn(n, tile_bytes, opts, ctypes.byref(temp_size), n * tile_bytes)
        if s != 0:
            return None

        # Allocate at least 1 byte so the temp pointer is always valid.
        ts = max(temp_size.value, 1)
        d_temp = cupy.empty(ts, dtype=cupy.uint8)
        d_statuses = cupy.zeros(n, dtype=cupy.int32)

        dec_fn = getattr(lib, dec_name)
        dec_fn.restype = ctypes.c_int
        # Launch the batched decompress on the default stream (last arg
        # NULL = stream 0).  Per-tile statuses land in d_statuses.
        s = dec_fn(
            ctypes.c_void_p(d_comp_ptrs.data.ptr),
            ctypes.c_void_p(d_comp_sizes.data.ptr),
            ctypes.c_void_p(d_buf_sizes.data.ptr),
            ctypes.c_void_p(d_actual.data.ptr),
            ctypes.c_size_t(n),
            ctypes.c_void_p(d_temp.data.ptr), ctypes.c_size_t(ts),
            ctypes.c_void_p(d_decomp_ptrs.data.ptr),
            opts,
            ctypes.c_void_p(d_statuses.data.ptr),
            ctypes.c_void_p(0),
        )
        if s != 0:
            return None

        # The call above is async: synchronize before reading statuses.
        cupy.cuda.Device().synchronize()
        if int(cupy.any(d_statuses != 0)):
            return None

        # Tiles concatenated in order -> offsets are i * tile_bytes.
        return cupy.concatenate(d_decomp_bufs)
    except Exception:
        return None
1008+
1009+
1010+
def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
                                  tile_width, tile_height,
                                  image_width, image_height,
                                  predictor, dtype, samples, tile_bytes):
    """Apply predictor decode and tile assembly on GPU.

    Shared GPU post-processing used by both the GDS and mmap decode
    paths: undoes the TIFF predictor (tag 317) in place on the
    decompressed tile data, then scatters tiles into a single
    row-major image buffer.

    Parameters
    ----------
    d_decomp : 1-D uint8 cupy array
        All decompressed tiles, concatenated.
    d_decomp_offsets : cupy int64 array
        Byte offset of each tile within d_decomp.
    n_tiles : int
        Number of tiles.
    tile_width, tile_height : int
        Tile dimensions in pixels.
    image_width, image_height : int
        Output image dimensions in pixels.
    predictor : int
        TIFF predictor: 2 = horizontal differencing, 3 = floating-point;
        any other value skips predictor decode.
    dtype : np.dtype
        Sample data type of the output.
    samples : int
        Samples per pixel; >1 yields an (H, W, samples) array.
    tile_bytes : int
        Decompressed size of one tile.  Unused in this function.

    Returns
    -------
    cupy array of `dtype`, shape (H, W) or (H, W, samples).
    """
    import cupy

    bytes_per_pixel = dtype.itemsize * samples

    if predictor == 2:
        # Horizontal differencing: one CUDA thread per tile row.
        total_rows = n_tiles * tile_height
        tpb = min(256, total_rows)
        bpg = math.ceil(total_rows / tpb)
        _predictor_decode_kernel[bpg, tpb](
            d_decomp, tile_width * samples, total_rows, dtype.itemsize * samples)
        cuda.synchronize()
    elif predictor == 3:
        # Floating-point predictor; kernel needs a scratch buffer.
        # NOTE(review): d_tmp is never read after the kernel launch --
        # this only works if the kernel writes its result back into
        # d_decomp (using d_tmp purely as scratch). Confirm against
        # _fp_predictor_decode_kernel's definition.
        total_rows = n_tiles * tile_height
        tpb = min(256, total_rows)
        bpg = math.ceil(total_rows / tpb)
        d_tmp = cupy.empty_like(d_decomp)
        _fp_predictor_decode_kernel[bpg, tpb](
            d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize)
        cuda.synchronize()

    # Scatter tiles into the final row-major image buffer, one CUDA
    # thread per output pixel.
    tiles_across = math.ceil(image_width / tile_width)
    total_pixels = image_width * image_height
    d_output = cupy.empty(total_pixels * bytes_per_pixel, dtype=cupy.uint8)

    tpb = 256
    bpg = math.ceil(total_pixels / tpb)
    _assemble_tiles_kernel[bpg, tpb](
        d_decomp, d_decomp_offsets,
        tile_width, tile_height, bytes_per_pixel,
        image_width, image_height, tiles_across,
        d_output,
    )
    cuda.synchronize()

    # Reinterpret the byte buffer as the target dtype and shape.
    if samples > 1:
        return d_output.view(dtype=cupy.dtype(dtype)).reshape(
            image_height, image_width, samples)
    return d_output.view(dtype=cupy.dtype(dtype)).reshape(
        image_height, image_width)
1054+
1055+
8541056
def gpu_decode_tiles(
8551057
compressed_tiles: list[bytes],
8561058
tile_width: int,

0 commit comments

Comments
 (0)