@@ -332,9 +332,49 @@ def _write_stripped(data: np.ndarray, compression: int, predictor: bool,
332332# Tile writer
333333# ---------------------------------------------------------------------------
334334
def _prepare_tile(data, tr, tc, th, tw, height, width, samples, dtype,
                  bytes_per_sample, predictor, compression):
    """Extract, pad, and compress one tile of *data*. Thread-safe.

    (tr, tc) is the tile's row/column index in the tile grid, (th, tw)
    the nominal tile shape, and (height, width) the full image shape.
    Returns the compressed byte string for this tile.
    """
    # Pixel bounds of this tile, clipped at the image edges.
    row_start, col_start = tr * th, tc * tw
    row_stop = min(row_start + th, height)
    col_stop = min(col_start + tw, width)
    got_h = row_stop - row_start
    got_w = col_stop - col_start

    window = data[row_start:row_stop, col_start:col_stop]

    if got_h == th and got_w == tw:
        # Interior tile: just guarantee a contiguous buffer for tobytes().
        tile_arr = np.ascontiguousarray(window)
    else:
        # Edge tile: copy into a full-size tile, then zero only the
        # padding strips (cheaper than zero-filling the whole tile).
        shape = (th, tw, samples) if data.ndim == 3 else (th, tw)
        tile_arr = np.empty(shape, dtype=dtype)
        tile_arr[:got_h, :got_w] = window
        if got_h < th:
            tile_arr[got_h:, :] = 0
        if got_w < tw:
            tile_arr[:got_h, got_w:] = 0

    if predictor and compression != COMPRESSION_NONE:
        # Horizontal differencing runs over the raw byte stream; copy so
        # the encoder never mutates a view of the caller's array.
        encoded = predictor_encode(
            tile_arr.view(np.uint8).ravel().copy(),
            tw, th, bytes_per_sample * samples,
        )
        payload = encoded.tobytes()
    else:
        payload = tile_arr.tobytes()

    return compress(payload, compression)
369+
370+
335371def _write_tiled (data : np .ndarray , compression : int , predictor : bool ,
336372 tile_size : int = 256 ) -> tuple [list , list , list ]:
337- """Compress data as tiles.
373+ """Compress data as tiles, using parallel compression.
374+
375+ For compressed formats (deflate, lzw, zstd), tiles are compressed
376+ in parallel using a thread pool. zlib, zstandard, and our Numba
377+ LZW all release the GIL.
338378
339379 Returns
340380 -------
@@ -350,55 +390,92 @@ def _write_tiled(data: np.ndarray, compression: int, predictor: bool,
350390 th = tile_size
351391 tiles_across = math .ceil (width / tw )
352392 tiles_down = math .ceil (height / th )
353-
354- tiles = []
355- rel_offsets = []
356- byte_counts = []
357- current_offset = 0
358-
359- for tr in range (tiles_down ):
360- for tc in range (tiles_across ):
361- r0 = tr * th
362- c0 = tc * tw
363- r1 = min (r0 + th , height )
364- c1 = min (c0 + tw , width )
365-
366- actual_h = r1 - r0
367- actual_w = c1 - c0
368-
369- # Extract tile, pad to full tile size if needed
370- tile_slice = data [r0 :r1 , c0 :c1 ]
371-
372- if actual_h < th or actual_w < tw :
373- if data .ndim == 3 :
374- padded = np .empty ((th , tw , samples ), dtype = dtype )
393+ n_tiles = tiles_across * tiles_down
394+
395+ if compression == COMPRESSION_NONE :
396+ # Uncompressed: pre-allocate a contiguous buffer for all tiles
397+ # and copy tile data directly, avoiding per-tile Python overhead.
398+ tile_bytes = tw * th * bytes_per_sample * samples
399+ total_buf = bytearray (n_tiles * tile_bytes )
400+ mv = memoryview (total_buf )
401+ tiles = []
402+ rel_offsets = []
403+ byte_counts = []
404+ current_offset = 0
405+
406+ for tr in range (tiles_down ):
407+ for tc in range (tiles_across ):
408+ r0 = tr * th
409+ c0 = tc * tw
410+ r1 = min (r0 + th , height )
411+ c1 = min (c0 + tw , width )
412+ actual_h = r1 - r0
413+ actual_w = c1 - c0
414+
415+ tile_slice = data [r0 :r1 , c0 :c1 ]
416+ if actual_h < th or actual_w < tw :
417+ if data .ndim == 3 :
418+ padded = np .zeros ((th , tw , samples ), dtype = dtype )
419+ else :
420+ padded = np .zeros ((th , tw ), dtype = dtype )
421+ padded [:actual_h , :actual_w ] = tile_slice
422+ tile_arr = padded
375423 else :
376- padded = np .empty ((th , tw ), dtype = dtype )
377- padded [:actual_h , :actual_w ] = tile_slice
378- # Zero only the padding regions
379- if actual_h < th :
380- padded [actual_h :, :] = 0
381- if actual_w < tw :
382- padded [:actual_h , actual_w :] = 0
383- tile_arr = padded
384- else :
385- tile_arr = np .ascontiguousarray (tile_slice )
424+ tile_arr = np .ascontiguousarray (tile_slice )
425+
426+ chunk = tile_arr .tobytes ()
427+ rel_offsets .append (current_offset )
428+ byte_counts .append (len (chunk ))
429+ tiles .append (chunk )
430+ current_offset += len (chunk )
431+
432+ return rel_offsets , byte_counts , tiles
433+
434+ if n_tiles <= 4 :
435+ # Very few tiles: sequential (thread pool overhead not worth it)
436+ tiles = []
437+ rel_offsets = []
438+ byte_counts = []
439+ current_offset = 0
440+ for tr in range (tiles_down ):
441+ for tc in range (tiles_across ):
442+ compressed = _prepare_tile (
443+ data , tr , tc , th , tw , height , width ,
444+ samples , dtype , bytes_per_sample , predictor , compression ,
445+ )
446+ rel_offsets .append (current_offset )
447+ byte_counts .append (len (compressed ))
448+ tiles .append (compressed )
449+ current_offset += len (compressed )
450+ return rel_offsets , byte_counts , tiles
451+
452+ # Parallel tile compression -- zlib/zstd/LZW all release the GIL
453+ from concurrent .futures import ThreadPoolExecutor
454+ import os
386455
387- if predictor and compression != COMPRESSION_NONE :
388- buf = tile_arr .view (np .uint8 ).ravel ().copy ()
389- buf = predictor_encode (buf , tw , th , bytes_per_sample * samples )
390- tile_data = buf .tobytes ()
391- else :
392- tile_data = tile_arr .tobytes ()
456+ n_workers = min (n_tiles , os .cpu_count () or 4 )
457+ tile_indices = [(tr , tc ) for tr in range (tiles_down )
458+ for tc in range (tiles_across )]
393459
394- compressed = compress (tile_data , compression )
460+ with ThreadPoolExecutor (max_workers = n_workers ) as pool :
461+ futures = [
462+ pool .submit (
463+ _prepare_tile , data , tr , tc , th , tw , height , width ,
464+ samples , dtype , bytes_per_sample , predictor , compression ,
465+ )
466+ for tr , tc in tile_indices
467+ ]
468+ compressed_tiles = [f .result () for f in futures ]
395469
396- rel_offsets .append (current_offset )
397- byte_counts .append (len (compressed ))
398- tiles .append (compressed )
399- current_offset += len (compressed )
470+ rel_offsets = []
471+ byte_counts = []
472+ current_offset = 0
473+ for ct in compressed_tiles :
474+ rel_offsets .append (current_offset )
475+ byte_counts .append (len (ct ))
476+ current_offset += len (ct )
400477
401- return rel_offsets , byte_counts , tiles
478+ return rel_offsets , byte_counts , compressed_tiles
402479
403480
404481# ---------------------------------------------------------------------------
@@ -736,7 +813,7 @@ def write(data: np.ndarray, path: str, *,
736813 geo_transform : GeoTransform | None = None ,
737814 crs_epsg : int | None = None ,
738815 nodata = None ,
739- compression : str = 'deflate ' ,
816+ compression : str = 'zstd ' ,
740817 tiled : bool = True ,
741818 tile_size : int = 256 ,
742819 predictor : bool = False ,
0 commit comments