Skip to content

Commit 25c0d84

Browse files
committed
Add nvCOMP batch decompression fast path for GPU reads
gpu_decode_tiles() now tries kvikio.nvcomp.DeflateManager for batch deflate decompression before falling back to the Numba CUDA inflate kernel. nvCOMP is NVIDIA's optimized batched compression library; it decompresses all tiles in a single GPU API call.

Fallback chain for GPU decompression:
1. nvCOMP via kvikio (if installed) -- optimized CUDA kernels.
2. Numba @cuda.jit inflate kernel -- pure Python/Numba implementation.
3. CPU zlib fallback -- used if GPU decode raises any error.

kvikio is an optional dependency (pip install kvikio-cu12, or conda install -c rapidsai kvikio). When it is not installed, the Numba kernels are used transparently.
1 parent 95c2a48 commit 25c0d84

File tree

1 file changed

+62
-1
lines changed

1 file changed

+62
-1
lines changed

xrspatial/geotiff/_gpu_decode.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,60 @@ def _assemble_tiles_kernel(
672672
output[dst_byte + b] = decompressed_buf[src_byte + b]
673673

674674

675+
# ---------------------------------------------------------------------------
676+
# nvCOMP batch decompression (optional, fast path)
677+
# ---------------------------------------------------------------------------
678+
679+
def _strip_zlib_wrapper(tile):
    """Return the raw deflate payload of a zlib-wrapped tile, or None.

    A zlib stream (RFC 1950) is a 2-byte header (CMF, FLG), the deflate
    data, and a 4-byte Adler-32 trailer. None is returned when the tile
    is too short to hold a payload, when the header is not a valid
    zlib/deflate header, or when a preset dictionary is flagged (the
    header is then 6 bytes long and the stream cannot be decoded without
    the dictionary).
    """
    # 2 header bytes + 4 trailer bytes leave no room for deflate data.
    if len(tile) <= 6:
        return None

    # int() guards against fixed-width integer types (e.g. numpy uint8)
    # overflowing in the shift below.
    cmf = int(tile[0])
    flg = int(tile[1])

    if (cmf & 0x0F) != 8:               # CM: 8 == deflate
        return None
    if ((cmf << 8) | flg) % 31 != 0:    # FCHECK: header must be a multiple of 31
        return None
    if flg & 0x20:                      # FDICT: preset dictionary follows header
        return None

    return tile[2:-4]


def _try_nvcomp_batch_decompress(compressed_tiles, tile_bytes, compression):
    """Try batch decompression via nvCOMP. Returns CuPy array or None.

    nvCOMP (NVIDIA's batched compression library) is asked to decompress
    all tiles in one call. This is an optional fast path: any problem
    results in None so the caller can fall back to another decoder.

    Parameters
    ----------
    compressed_tiles : sequence of bytes-like
        One zlib-wrapped deflate stream per tile.
    tile_bytes : int
        Expected decompressed size of every tile, in bytes.
    compression : int
        TIFF compression tag. Only 8 (Deflate) and 32946 (Adobe Deflate)
        are attempted; every other value (including 5, LZW, whose TIFF
        variant nvCOMP does not support) returns None.

    Returns
    -------
    cupy.ndarray or None
        The decompressed tiles concatenated in input order, each tile
        occupying exactly ``tile_bytes`` bytes, or None if kvikio/nvCOMP
        is unavailable, the codec is unsupported, a tile is not a plain
        zlib stream, decompression fails, or the output does not have
        the expected per-tile size.
    """
    # Decide on the codec before paying for any optional import.
    if compression not in (8, 32946):
        return None

    try:
        import kvikio.nvcomp as nvcomp
    except ImportError:
        return None

    # Deliberately broad: this is a best-effort fast path and every
    # failure (missing cupy, unsupported API, CUDA error, bad input)
    # must fall through to the caller's fallback decoder.
    try:
        import cupy

        # nvCOMP deflate wants raw deflate data, so drop the zlib
        # header/trailer; bail out if any tile is not a plain zlib stream.
        raw_tiles = []
        for tile in compressed_tiles:
            payload = _strip_zlib_wrapper(tile)
            if payload is None:
                return None
            raw_tiles.append(payload)
        if not raw_tiles:
            return None

        manager = nvcomp.DeflateManager(chunk_size=tile_bytes)

        # Copy compressed data to device
        d_compressed = [cupy.asarray(np.frombuffer(t, dtype=np.uint8))
                        for t in raw_tiles]

        # Batch decompress
        d_decompressed = manager.decompress(d_compressed)
        chunks = [d.ravel() for d in d_decompressed]

        # The caller addresses tile i at byte offset i * tile_bytes, so
        # a short, long, or missing chunk would silently corrupt output.
        if len(chunks) != len(raw_tiles):
            return None
        if any(chunk.nbytes != tile_bytes for chunk in chunks):
            return None

        # Concatenate results into a single buffer
        return cupy.concatenate(chunks)
    except Exception:
        return None
727+
728+
675729
# ---------------------------------------------------------------------------
676730
# High-level GPU decode pipeline
677731
# ---------------------------------------------------------------------------
@@ -717,7 +771,14 @@ def gpu_decode_tiles(
717771
bytes_per_pixel = dtype.itemsize * samples
718772
tile_bytes = tile_width * tile_height * bytes_per_pixel
719773

720-
if compression == 5: # LZW
774+
# Try nvCOMP batch decompression first (much faster if available)
775+
nvcomp_result = _try_nvcomp_batch_decompress(
776+
compressed_tiles, tile_bytes, compression)
777+
if nvcomp_result is not None:
778+
d_decomp = nvcomp_result
779+
decomp_offsets = np.arange(n_tiles, dtype=np.int64) * tile_bytes
780+
d_decomp_offsets = cupy.asarray(decomp_offsets)
781+
elif compression == 5: # LZW
721782
# Concatenate all compressed tiles into one device buffer
722783
comp_sizes = [len(t) for t in compressed_tiles]
723784
comp_offsets = np.zeros(n_tiles, dtype=np.int64)

0 commit comments

Comments
 (0)