-
Notifications
You must be signed in to change notification settings - Fork 85
Expand file tree
/
Copy path__init__.py
More file actions
3395 lines (3059 loc) · 141 KB
/
__init__.py
File metadata and controls
3395 lines (3059 loc) · 141 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Lightweight GeoTIFF/COG reader and writer.
No GDAL dependency -- uses only numpy, numba, xarray, and the standard library.
Public API
----------
open_geotiff(source, ...)
Read a GeoTIFF, COG, or VRT file to an xarray.DataArray. Auto-dispatches
to the GPU, dask, or numpy backend based on the ``gpu`` and ``chunks``
kwargs.
read_geotiff_gpu(source, ...)
GPU-only read returning a CuPy-backed DataArray. ``open_geotiff(...,
gpu=True)`` calls this internally; use the explicit name when you want
the strict-mode failure semantics (``on_gpu_failure='strict'``) or want
to bypass auto-dispatch.
read_geotiff_dask(source, ...)
Dask-only read returning a windowed lazy DataArray. ``open_geotiff(...,
chunks=N)`` calls this internally.
read_vrt(source, ...)
Read a GDAL Virtual Raster Table (.vrt). ``open_geotiff`` routes ``.vrt``
paths here automatically; the explicit entry point is useful for
callers that already know they have a VRT.
to_geotiff(data, path, ...)
Write an xarray.DataArray as a GeoTIFF or COG. Auto-dispatches to GPU
when the data is CuPy-backed.
write_geotiff_gpu(data, path, ...)
GPU-only writer using nvCOMP. ``to_geotiff(..., gpu=True)`` calls this
internally.
write_vrt(vrt_path, source_files, ...)
Generate a VRT mosaic XML from a list of GeoTIFF files.
"""
from __future__ import annotations
import math
import os
import warnings
from typing import TYPE_CHECKING
import numpy as np
import xarray as xr
if TYPE_CHECKING:
from typing import BinaryIO
from ._geotags import GeoTransform, RASTER_PIXEL_IS_AREA, RASTER_PIXEL_IS_POINT
from ._reader import read_to_array
from ._writer import write
# All names below are part of the supported public API. ``plot_geotiff``
# is intentionally omitted: it is deprecated in favour of ``da.xrs.plot()``
# and emits a ``DeprecationWarning`` when called.
__all__ = [
    'GeoTIFFFallbackWarning',
    'open_geotiff',
    'read_geotiff_gpu',
    'read_geotiff_dask',
    'read_vrt',
    'to_geotiff',
    'write_geotiff_gpu',
    'write_vrt',
]

# Sentinels distinguishing "user passed this kwarg explicitly" from "user
# passed nothing". A plain default of None does not work because None is
# itself a value a caller could supply. ``read_geotiff_gpu`` needs both
# sentinels so it can tell whether the deprecated ``gpu=`` and the new
# ``on_gpu_failure=`` were *each* supplied, and refuse the ambiguous
# both-supplied case regardless of which values were chosen.
# ``open_geotiff`` also uses ``_ON_GPU_FAILURE_SENTINEL`` to distinguish
# "caller never set on_gpu_failure" (default sentinel: skip forwarding so
# the read_geotiff_gpu signature default applies) from "caller set
# on_gpu_failure=<value>" (forward verbatim).
# Sentinels are compared by identity (``is`` / ``is not``), never equality.
_GPU_DEPRECATED_SENTINEL = object()
_ON_GPU_FAILURE_SENTINEL = object()

# Names of dims that ``to_geotiff`` / ``write_geotiff_gpu`` treat as the
# non-spatial band axis. Used both to remap ``(band, y, x)`` inputs to
# ``(y, x, band)`` before writing and to skip the band axis when inferring
# a GeoTransform from coords (see ``_coords_to_transform`` and issue #1643).
_BAND_DIM_NAMES = ('band', 'bands', 'channel')
class GeoTIFFFallbackWarning(UserWarning):
    """Emitted when a geotiff helper degrades to a slower fallback path.

    These warnings replace the historic silent ``except Exception:
    return None`` pattern at the same call sites (audited in issue
    #1662). Setting the ``XRSPATIAL_GEOTIFF_STRICT=1`` env var promotes
    them to raised exceptions.
    """
def _geotiff_strict_mode() -> bool:
    """Report whether ``XRSPATIAL_GEOTIFF_STRICT`` holds a truthy value.

    Strict mode turns the silent fallbacks audited in issue #1662 into
    raised exceptions -- handy in CI to surface GPU-path or VRT
    regressions that a CPU fallback or missing tile would otherwise mask.
    """
    flag = os.environ.get('XRSPATIAL_GEOTIFF_STRICT', '')
    return flag.lower() in ('1', 'true', 'yes')
def _wkt_to_epsg(wkt_or_proj: str) -> int | None:
    """Best-effort conversion of a WKT or PROJ CRS string to an EPSG code.

    Returns None when pyproj is unavailable or the string cannot be
    parsed. Under ``XRSPATIAL_GEOTIFF_STRICT=1`` the underlying
    exception propagates instead of being swallowed; in the default
    mode a ``GeoTIFFFallbackWarning`` is emitted so callers can tell
    pyproj-missing apart from pyproj-broken-input.
    """
    try:
        from pyproj import CRS
        return CRS.from_user_input(wkt_or_proj).to_epsg()
    except Exception as exc:
        if _geotiff_strict_mode():
            raise
        warnings.warn(
            f"_wkt_to_epsg failed ({type(exc).__name__}: {exc}); returning None.",
            GeoTIFFFallbackWarning,
            stacklevel=2,
        )
        return None
def _geo_to_coords(geo_info, height: int, width: int) -> dict:
    """Build pixel-center y/x coordinate arrays from a GeoInfo.

    PixelIsArea rasters (the default) anchor the GeoTransform origin on
    the edge of pixel (0, 0), so centers sit half a pixel further in;
    PixelIsPoint rasters anchor the tiepoint on the center already, so
    no shift is applied. Either way the returned coords follow xarray's
    pixel-center convention. The raw GeoTransform itself is preserved
    by the callers as a rasterio-style 6-tuple in
    ``attrs['transform']`` (``(pixel_width, 0, origin_x, 0,
    pixel_height, origin_y)``), which ``to_geotiff`` prefers over
    re-deriving the transform from these coords -- that avoids float
    drift on fractional-precision rasters.

    Files without GeoTIFF tags (``has_georef=False``) get plain integer
    pixel indices 0..N-1 instead of fractional values derived from the
    default unit transform.
    """
    if not getattr(geo_info, 'has_georef', True):
        return {
            'y': np.arange(height, dtype=np.int64),
            'x': np.arange(width, dtype=np.int64),
        }
    transform = geo_info.transform
    # Half-pixel shift applies only to edge-anchored (PixelIsArea) rasters.
    if geo_info.raster_type == RASTER_PIXEL_IS_POINT:
        shift_x = 0.0
        shift_y = 0.0
    else:
        shift_x = transform.pixel_width * 0.5
        shift_y = transform.pixel_height * 0.5
    cols = np.arange(width, dtype=np.float64)
    rows = np.arange(height, dtype=np.float64)
    return {
        'y': rows * transform.pixel_height + transform.origin_y + shift_y,
        'x': cols * transform.pixel_width + transform.origin_x + shift_x,
    }
def _transform_tuple(geo_info) -> tuple | None:
    """Serialize a GeoInfo's transform as a rasterio-style 6-tuple.

    Ordering is ``(pixel_width, 0.0, origin_x, 0.0, pixel_height,
    origin_y)``, matching the ordering rioxarray's ``rio.transform()``
    output uses. Persisting the tuple on the DataArray lets
    ``to_geotiff`` restore the source GeoTransform byte-for-byte, which
    side-steps float drift in the y/x coord arrays. Returns None when
    the GeoInfo or its transform is missing.
    """
    if geo_info is None:
        return None
    transform = geo_info.transform
    if transform is None:
        return None
    return (
        float(transform.pixel_width), 0.0, float(transform.origin_x),
        0.0, float(transform.pixel_height), float(transform.origin_y),
    )
def _transform_from_attr(attr_val) -> 'GeoTransform | None':
    """Coerce an ``attrs['transform']`` value into a GeoTransform.

    Accepts either a ``GeoTransform`` instance (returned as-is) or a
    6-sequence in rasterio Affine order ``(a, b, c, d, e, f)``; the
    shear terms ``b`` and ``d`` are ignored because only axis-aligned
    affines round-trip. GDAL ordering ``(c, a, b, f, d, e)`` is
    deliberately not accepted. Anything else yields None.
    """
    if attr_val is None:
        return None
    if isinstance(attr_val, GeoTransform):
        return attr_val
    try:
        values = tuple(attr_val)
    except TypeError:
        return None
    if len(values) != 6:
        return None
    try:
        pw, _shear_x, ox, _shear_y, ph, oy = (float(v) for v in values)
    except (TypeError, ValueError):
        return None
    return GeoTransform(
        origin_x=ox, origin_y=oy, pixel_width=pw, pixel_height=ph,
    )
def _validate_dtype_cast(source_dtype, target_dtype):
    """Reject lossy float-to-integer casts; allow everything else.

    Raises
    ------
    ValueError
        When ``source_dtype`` is floating point and ``target_dtype`` is
        a signed or unsigned integer -- that cast drops fractional data
        in a way callers rarely intend. Every other combination is
        permitted because the user asked for it explicitly.
    """
    source = np.dtype(source_dtype)
    target = np.dtype(target_dtype)
    lossy = source.kind == 'f' and target.kind in ('u', 'i')
    if lossy:
        raise ValueError(
            f"Cannot cast float ({source}) to int ({target}). "
            f"This loses fractional data and is usually unintentional. "
            f"Cast explicitly after reading if you really want this.")
def _coords_to_transform(da: xr.DataArray) -> GeoTransform | None:
    """Derive a GeoTransform from a DataArray's y/x coordinate arrays.

    Coordinates always hold pixel-center values, so the transform
    origin depends on the raster type recorded in
    ``attrs['raster_type']``:

    - PixelIsArea (default): origin = first center - half a pixel
    - PixelIsPoint: origin = first center

    For 3D arrays the band-like dim (any name in ``_BAND_DIM_NAMES``)
    is dropped wherever it sits, so ``(y, x, band)``, ``(band, y, x)``
    and ``(y, band, x)`` all resolve to the same y/x transform instead
    of mistaking band spacing for a pixel size. ``to_geotiff`` remaps
    ``(band, y, x)`` inputs to ``(y, x, band)`` before writing pixels
    but calls this helper on the *original* DataArray, so both layouts
    must produce a transform consistent with the file's coord arrays
    (issue #1643). Returns None when the spatial coords are missing or
    have fewer than 2 samples (no pixel size can be measured).
    """
    if da.ndim == 3:
        non_band = [d for d in da.dims if d not in _BAND_DIM_NAMES]
        if len(non_band) == 2:
            ydim, xdim = non_band
        else:
            # Hand-built 3D layout with no recognizable band dim:
            # default to the trailing two dims, mirroring the 2D case.
            ydim, xdim = da.dims[-2], da.dims[-1]
    else:
        ydim, xdim = da.dims[-2], da.dims[-1]
    if xdim not in da.coords or ydim not in da.coords:
        return None
    xs = da.coords[xdim].values
    ys = da.coords[ydim].values
    if len(xs) < 2 or len(ys) < 2:
        return None
    pixel_width = float(xs[1] - xs[0])
    pixel_height = float(ys[1] - ys[0])
    if da.attrs.get('raster_type') == 'point':
        # PixelIsPoint: the tiepoint sits on the first pixel center.
        origin_x = float(xs[0])
        origin_y = float(ys[0])
    else:
        # PixelIsArea: back the tiepoint off to the pixel edge.
        origin_x = float(xs[0]) - pixel_width * 0.5
        origin_y = float(ys[0]) - pixel_height * 0.5
    return GeoTransform(
        origin_x=origin_x,
        origin_y=origin_y,
        pixel_width=pixel_width,
        pixel_height=pixel_height,
    )
def _read_geo_info(source, *, overview_level: int | None = None):
    """Parse geographic metadata and dimensions without reading pixels.

    Returns ``(geo_info, height, width, dtype, n_bands)``. String paths
    are memory-mapped so only the header bytes are touched -- O(1)
    memory regardless of file size. File-like inputs are slurped into
    memory instead, because an arbitrary buffer may not be backed by a
    real file descriptor that mmap could use.

    Parameters
    ----------
    source : str or binary file-like
        Path or any object with ``read``/``seek``.
    overview_level : int or None
        Overview IFD index (0 = full resolution).
    """
    from ._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
    from ._geotags import extract_geo_info_with_overview_inheritance
    from ._header import parse_all_ifds, parse_header, select_overview_ifd
    from ._reader import _coerce_path, _is_file_like

    source = _coerce_path(source)
    if _is_file_like(source):
        # Remember the caller's cursor, read everything, then restore
        # the position (best effort on both tell and the final seek).
        try:
            saved_pos = source.tell()
        except (OSError, AttributeError):
            saved_pos = 0
        source.seek(0)
        buf = source.read()
        try:
            source.seek(saved_pos)
        except (OSError, AttributeError):
            pass
        owns_buf = False
    elif isinstance(source, str):
        import mmap
        # The mapping stays valid after the file object closes.
        with open(source, 'rb') as fh:
            buf = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
        owns_buf = True
    else:
        raise TypeError(
            "source must be a str path or binary file-like, "
            f"got {type(source).__name__}")
    try:
        header = parse_header(buf)
        ifds = parse_all_ifds(buf, header)
        if not ifds:
            raise ValueError("No IFDs found in TIFF file")
        ifd = select_overview_ifd(ifds, overview_level)
        # Overviews lacking their own geokeys inherit georef from the
        # level-0 IFD (issue #1640); level 0 passes through unchanged.
        geo_info = extract_geo_info_with_overview_inheritance(
            ifd, ifds, buf, header.byte_order)
        bits = resolve_bits_per_sample(ifd.bits_per_sample)
        file_dtype = tiff_dtype_to_numpy(bits, ifd.sample_format)
        n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0
        return geo_info, ifd.height, ifd.width, file_dtype, n_bands
    finally:
        if owns_buf:
            buf.close()
def _extent_to_window(transform, file_height, file_width,
                      y_min, y_max, x_min, x_max):
    """Map a geographic extent onto a pixel window, clamped to the file.

    Returns ``(row_start, col_start, row_stop, col_stop)``.
    ``pixel_height`` is typically negative (north-up rasters), so the
    row bounds can come out reversed; both axes are re-ordered before
    the floor/ceil snap and the clamp to the file dimensions.
    """
    # Geographic coords -> fractional pixel coords, sorted low-to-high.
    cols = sorted(((x_min - transform.origin_x) / transform.pixel_width,
                   (x_max - transform.origin_x) / transform.pixel_width))
    rows = sorted(((y_max - transform.origin_y) / transform.pixel_height,
                   (y_min - transform.origin_y) / transform.pixel_height))
    row_start = max(0, int(np.floor(rows[0])))
    col_start = max(0, int(np.floor(cols[0])))
    row_stop = min(file_height, int(np.ceil(rows[1])))
    col_stop = min(file_width, int(np.ceil(cols[1])))
    return (row_start, col_start, row_stop, col_stop)
def _populate_attrs_from_geo_info(attrs: dict, geo_info, *, window=None) -> None:
    """Copy every GeoTIFF metadata field from ``geo_info`` into ``attrs``.

    Centralised so the eager numpy, dask, and GPU read paths emit
    identical attrs keys for the same file. Mutates ``attrs`` in place.
    ``nodata`` is deliberately NOT handled here: each caller sets it
    beside its own masking step, because the key's presence signals
    "this array has been NaN-masked".

    ``window`` is an optional ``(r0, c0, r1, c1)`` tuple; when given,
    ``attrs['transform']`` is shifted so its origin lands on the
    window's top-left pixel. Eager and dask reads (the latter since
    #1561) thread their window through here; the GPU path has no
    windowed read yet and always passes ``window=None``.
    """
    if geo_info.crs_epsg is not None:
        attrs['crs'] = geo_info.crs_epsg
    if geo_info.crs_wkt is not None:
        attrs['crs_wkt'] = geo_info.crs_wkt
    if geo_info.raster_type == RASTER_PIXEL_IS_POINT:
        attrs['raster_type'] = 'point'
    transform = geo_info.transform
    if transform is not None:
        if window is not None:
            # Translate the origin to the window's top-left corner.
            row0, col0, _row1, _col1 = window
            pw = float(transform.pixel_width)
            ph = float(transform.pixel_height)
            attrs['transform'] = (
                pw, 0.0, float(transform.origin_x) + col0 * pw,
                0.0, ph, float(transform.origin_y) + row0 * ph,
            )
        else:
            as_tuple = _transform_tuple(geo_info)
            if as_tuple is not None:
                attrs['transform'] = as_tuple
    # Straight one-to-one copies, listed in the order the attrs have
    # always been emitted (attrs key, GeoInfo attribute). Note
    # ``vertical_crs`` is the one key whose GeoInfo field is named
    # differently (``vertical_epsg``).
    for attr_key, info_field in (
        ('crs_name', 'crs_name'),
        ('geog_citation', 'geog_citation'),
        ('datum_code', 'datum_code'),
        ('angular_units', 'angular_units'),
        ('linear_units', 'linear_units'),
        ('semi_major_axis', 'semi_major_axis'),
        ('inv_flattening', 'inv_flattening'),
        ('projection_code', 'projection_code'),
        ('vertical_crs', 'vertical_epsg'),
        ('vertical_citation', 'vertical_citation'),
        ('vertical_units', 'vertical_units'),
        ('gdal_metadata', 'gdal_metadata'),
        ('gdal_metadata_xml', 'gdal_metadata_xml'),
        ('extra_tags', 'extra_tags'),
        ('image_description', 'image_description'),
        ('extra_samples', 'extra_samples'),
        ('x_resolution', 'x_resolution'),
        ('y_resolution', 'y_resolution'),
    ):
        value = getattr(geo_info, info_field)
        if value is not None:
            attrs[attr_key] = value
    if geo_info.resolution_unit is not None:
        unit_names = {1: 'none', 2: 'inch', 3: 'centimeter'}
        attrs['resolution_unit'] = unit_names.get(
            geo_info.resolution_unit, str(geo_info.resolution_unit))
    if geo_info.colormap is not None:
        # matplotlib is optional: with it installed we also expose a
        # ready-to-use ListedColormap; without it only the raw RGBA
        # palette survives.
        try:
            from matplotlib.colors import ListedColormap
        except ImportError:
            attrs['colormap_rgba'] = geo_info.colormap
        else:
            attrs['cmap'] = ListedColormap(
                geo_info.colormap, name='tiff_palette')
            attrs['colormap_rgba'] = geo_info.colormap
    if geo_info.extra_tags is not None:
        for tag_id, _type_id, _count, tag_value in geo_info.extra_tags:
            if tag_id == 320:  # TAG_COLORMAP
                attrs['colormap'] = tag_value
                break
def open_geotiff(source, *, dtype=None,
                 window: tuple | None = None,
                 overview_level: int | None = None,
                 band: int | None = None,
                 name: str | None = None,
                 chunks: int | tuple | None = None,
                 gpu: bool = False,
                 max_pixels: int | None = None,
                 on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
                 ) -> xr.DataArray:
    """Read a GeoTIFF, COG, or VRT file into an xarray.DataArray.

    Automatically dispatches to the best backend:

    - ``gpu=True``: GPU-accelerated read via nvCOMP (returns CuPy)
    - ``chunks=N``: Dask lazy read via windowed chunks
    - ``gpu=True, chunks=N``: Dask+CuPy for out-of-core GPU pipelines
    - Default: NumPy eager read

    VRT files are auto-detected by extension.

    Parameters
    ----------
    source : str or binary file-like
        File path, HTTP URL, cloud URI (s3://, gs://, az://), or a
        binary file-like object (e.g. ``io.BytesIO``) with read+seek.
        VRT, dask-chunked, GPU, and remote-URL paths require a string;
        in-memory file-like buffers go through the eager numpy reader.
    dtype : str, numpy.dtype, or None
        Cast the result to this dtype after reading. None keeps the
        file's native dtype. Float-to-int casts raise ValueError to
        prevent accidental data loss.
    window : tuple or None
        (row_start, col_start, row_stop, col_stop) for windowed reading.
    overview_level : int or None
        Overview level (0 = full resolution).
    band : int or None
        Band index (0-based). None returns all bands.
    name : str or None
        Name for the DataArray.
    chunks : int, tuple, or None
        Chunk size for Dask lazy reading.
    gpu : bool
        Use GPU-accelerated decompression (requires cupy + nvCOMP).
    max_pixels : int or None
        Maximum allowed pixel count (width * height * samples). None
        uses the default (~1 billion). Raise to read legitimately
        large files.
    on_gpu_failure : {'auto', 'strict'}, optional
        Forwarded to ``read_geotiff_gpu`` when ``gpu=True``. Controls
        whether GPU decode failures fall back to CPU (``'auto'``,
        default) or re-raise the original exception (``'strict'``).
        Passing this kwarg with ``gpu=False`` raises ``ValueError``
        because the policy only applies to the GPU pipeline. See
        ``read_geotiff_gpu`` for the full description.

    Returns
    -------
    xr.DataArray
        NumPy, Dask, CuPy, or Dask+CuPy backed depending on options.

    Notes
    -----
    The CRS is stored as an int EPSG code in ``attrs['crs']`` whenever the
    file's GeoKeys carry a recognized EPSG. Files whose CRS can only be
    expressed as WKT keep the WKT in ``attrs['crs_wkt']`` and leave
    ``attrs['crs']`` unset. ``to_geotiff`` accepts either an int EPSG or a
    WKT string in ``attrs['crs']`` for backward compatibility.

    The file's GeoTransform is also surfaced as ``attrs['transform']``,
    a rasterio-style 6-tuple
    ``(pixel_width, 0, origin_x, 0, pixel_height, origin_y)``. ``to_geotiff``
    uses this attr verbatim when present, falling back to recomputing the
    transform from the y/x coord arrays only when it is missing. The attr
    is what makes write -> read -> write -> read round-trips bit-stable for
    rasters with fractional pixel sizes or origins.

    Integer rasters with a nodata sentinel are silently promoted to
    ``float64`` with NaN replacing the sentinel so downstream NaN-aware
    code works uniformly. Pass ``dtype=...`` to keep the source dtype
    (the cast will fail with ``ValueError`` for float-to-int because that
    is lossy in a way users rarely intend; cast explicitly after read if
    you need it).
    """
    from ._reader import _coerce_path
    source = _coerce_path(source)
    # ``on_gpu_failure`` is GPU-only. Reject it up front for CPU/dask paths
    # rather than silently dropping it once dispatch is decided -- callers
    # otherwise have no way to learn that the policy is being ignored.
    # ``gpu=False`` (the default) on a ``.vrt`` source still routes through
    # ``read_vrt`` below which has no GPU-failure concept, so the same
    # rejection rule applies there.
    if on_gpu_failure is not _ON_GPU_FAILURE_SENTINEL and not gpu:
        raise ValueError(
            "on_gpu_failure only applies when gpu=True. "
            "Pass gpu=True to enable the GPU pipeline, or drop "
            "on_gpu_failure to keep the default CPU path.")
    # VRT files (string paths only -- VRT XML references other files on disk)
    if isinstance(source, str) and source.lower().endswith('.vrt'):
        return read_vrt(source, dtype=dtype, window=window, band=band,
                        name=name, chunks=chunks, gpu=gpu,
                        max_pixels=max_pixels)
    # File-like buffers don't support the GPU or dask code paths because
    # those re-open the source by path from worker tasks or device-side
    # readers. Reject early with a clear message.
    if not isinstance(source, str):
        if gpu:
            raise ValueError(
                "gpu=True is not supported for file-like sources. "
                "Pass a path string instead.")
        if chunks is not None:
            raise ValueError(
                "chunks=... (dask) is not supported for file-like sources. "
                "Pass a path string instead.")
    # GPU path
    if gpu:
        gpu_kwargs = {}
        if on_gpu_failure is not _ON_GPU_FAILURE_SENTINEL:
            gpu_kwargs['on_gpu_failure'] = on_gpu_failure
        return read_geotiff_gpu(source, dtype=dtype,
                                overview_level=overview_level,
                                window=window, band=band,
                                name=name, chunks=chunks,
                                max_pixels=max_pixels,
                                **gpu_kwargs)
    # Dask path (CPU)
    if chunks is not None:
        return read_geotiff_dask(source, dtype=dtype, chunks=chunks,
                                 overview_level=overview_level,
                                 window=window, band=band,
                                 max_pixels=max_pixels, name=name)
    kwargs = {}
    if max_pixels is not None:
        kwargs['max_pixels'] = max_pixels
    # ``read_to_array`` validates ``window`` against the selected IFD's
    # extent and raises ``ValueError`` for out-of-bounds windows with
    # the same message format as the dask path's pre-flight validator
    # in :func:`read_geotiff_dask`. That keeps the two backends in sync
    # on the contract without forcing a second metadata parse here. See
    # issue #1634.
    arr, geo_info = read_to_array(
        source, window=window,
        overview_level=overview_level, band=band,
        **kwargs,
    )
    height, width = arr.shape[:2]
    coords = _geo_to_coords(geo_info, height, width)
    if window is not None:
        # Adjust coordinates for windowed read. ``read_to_array`` rejected
        # out-of-bounds windows above, so ``r0/c0/r1/c1`` are guaranteed
        # in-range here (no clamp needed).
        r0, c0, r1, c1 = window
        if not getattr(geo_info, 'has_georef', True):
            # FIX: honour the no-georef contract of ``_geo_to_coords``
            # for windowed reads too -- emit integer pixel indices for
            # the window instead of fractional values derived from the
            # default unit transform.
            coords = {
                'y': np.arange(r0, r1, dtype=np.int64),
                'x': np.arange(c0, c1, dtype=np.int64),
            }
        else:
            t = geo_info.transform
            if geo_info.raster_type == RASTER_PIXEL_IS_POINT:
                full_x = (np.arange(c0, c1, dtype=np.float64)
                          * t.pixel_width + t.origin_x)
                full_y = (np.arange(r0, r1, dtype=np.float64)
                          * t.pixel_height + t.origin_y)
            else:
                full_x = (np.arange(c0, c1, dtype=np.float64)
                          * t.pixel_width + t.origin_x
                          + t.pixel_width * 0.5)
                full_y = (np.arange(r0, r1, dtype=np.float64)
                          * t.pixel_height + t.origin_y
                          + t.pixel_height * 0.5)
            coords = {'y': full_y, 'x': full_x}
    if name is None:
        # Derive from source path (the module-level ``os`` import is in
        # scope; the previous function-local ``import os`` redundantly
        # shadowed it). File-like buffers don't have a path, so leave
        # name unset rather than fabricating one.
        if isinstance(source, str):
            name = os.path.splitext(os.path.basename(source))[0]
    attrs = {}
    _populate_attrs_from_geo_info(attrs, geo_info, window=window)
    # Apply nodata mask: replace nodata sentinel values with NaN.
    # ``arr`` came from ``read_to_array``, which returns a freshly
    # allocated buffer from ``_read_tiles`` / ``_read_strips`` (and is
    # ``np.ascontiguousarray``-wrapped if the orientation tag triggered
    # a remap). Nothing else holds a reference here, so the in-place
    # write is safe; an extra ``arr.copy()`` would just double peak
    # memory for a multi-MB raster.
    nodata = geo_info.nodata
    if nodata is not None:
        attrs['nodata'] = nodata
        if arr.dtype.kind == 'f':
            if not np.isnan(nodata):
                arr[arr == arr.dtype.type(nodata)] = np.nan
        elif arr.dtype.kind in ('u', 'i'):
            # Integer arrays: convert to float to represent NaN.
            # An out-of-range sentinel (e.g. uint16 file with
            # GDAL_NODATA="-9999") cannot match any decoded pixel, so the
            # mask would be all-False -- skip the cast that would otherwise
            # raise OverflowError and leave the array unchanged.
            nodata_int = int(nodata)
            info = np.iinfo(arr.dtype)
            if info.min <= nodata_int <= info.max:
                mask = arr == arr.dtype.type(nodata_int)
                if mask.any():
                    arr = arr.astype(np.float64)
                    arr[mask] = np.nan
    if dtype is not None:
        target = np.dtype(dtype)
        _validate_dtype_cast(arr.dtype, target)
        arr = arr.astype(target)
    if arr.ndim == 3:
        dims = ['y', 'x', 'band']
        coords['band'] = np.arange(arr.shape[2])
    else:
        dims = ['y', 'x']
    return xr.DataArray(
        arr,
        dims=dims,
        coords=coords,
        name=name,
        attrs=attrs,
    )
def _is_gpu_data(data) -> bool:
    """Return True when ``data`` (raw array or DataArray) is CuPy-backed.

    Dask-wrapped DataArrays count as GPU data when their ``_meta``
    array is a CuPy ndarray. Without CuPy installed the answer is
    always False.
    """
    try:
        from cupy import ndarray as _cupy_ndarray
    except ImportError:
        return False
    if not isinstance(data, xr.DataArray):
        return isinstance(data, _cupy_ndarray)
    inner = data.data
    if hasattr(inner, 'compute'):
        # Dask collection: inspect the meta array, not the graph.
        return isinstance(getattr(inner, '_meta', None), _cupy_ndarray)
    return isinstance(inner, _cupy_ndarray)
def _apply_nodata_mask_gpu(arr_gpu, nodata):
    """NaN-out nodata sentinel values on a CuPy array.

    GPU twin of the nodata masking in :func:`open_geotiff`'s eager
    path: float arrays get NaN substituted for the sentinel, integer
    arrays are promoted to float64 first (and only when the sentinel
    actually occurs). A NaN sentinel on a float array is a no-op --
    NaN-aware code already handles it. An integer sentinel outside the
    dtype's value range (e.g. uint16 + GDAL_NODATA="-9999") cannot
    match any decoded pixel, so the cast that would otherwise raise
    OverflowError is skipped entirely.

    Returns the possibly promoted / masked CuPy array. The caller still
    records ``attrs['nodata']`` so the sentinel stays discoverable
    downstream (and survives a write round-trip).
    """
    import cupy
    if nodata is None:
        return arr_gpu
    dtype = np.dtype(str(arr_gpu.dtype))
    if dtype.kind == 'f':
        if np.isnan(nodata):
            return arr_gpu
        sentinel = dtype.type(nodata)
        return cupy.where(arr_gpu == sentinel, dtype.type('nan'), arr_gpu)
    if dtype.kind not in ('u', 'i'):
        return arr_gpu
    sentinel_value = int(nodata)
    limits = np.iinfo(dtype)
    if sentinel_value < limits.min or sentinel_value > limits.max:
        # Out-of-range sentinel: no pixel can match, skip the promotion.
        return arr_gpu
    hits = arr_gpu == dtype.type(sentinel_value)
    if bool(hits.any().item()):
        arr_gpu = arr_gpu.astype(cupy.float64)
        arr_gpu = cupy.where(hits, cupy.float64('nan'), arr_gpu)
    return arr_gpu
# Inclusive (min, max) compression level per codec.
# NOTE(review): the consumer is outside this chunk -- presumably the
# writer's ``level=`` validation; confirm against ``to_geotiff`` /
# ``_writer.py``.
_LEVEL_RANGES = {
    'deflate': (1, 9),
    'zstd': (1, 22),
    'lz4': (0, 16),
}

# Names accepted by ``compression=`` in :func:`to_geotiff`. Kept in sync with
# ``_compression_tag`` in ``_writer.py``. Validated up-front so users see a
# friendly error rather than the deeper traceback from ``_compression_tag``.
_VALID_COMPRESSIONS = (
    'none', 'deflate', 'lzw', 'jpeg', 'packbits', 'zstd', 'lz4',
    'jpeg2000', 'j2k', 'lerc',
)

# TIFF type ids needed when synthesizing extra_tags entries from attrs
# (1 = BYTE, 2 = ASCII, 3 = SHORT; see ``_merge_friendly_extra_tags``).
_TIFF_BYTE = 1
_TIFF_ASCII = 2
_TIFF_SHORT = 3
def _resolve_nodata_attr(attrs: dict):
"""Resolve a NoData sentinel from DataArray attrs.
xrspatial's own readers always emit ``attrs['nodata']`` (a scalar),
so that key is checked first for a clean intra-library round-trip.
Falls back to two ecosystem conventions on miss:
* ``attrs['nodatavals']`` -- rioxarray's per-band tuple. Returns
the first entry that is not None, not non-numeric, and not NaN.
In practice this is band 0 for almost every real file; the skip
logic only matters when band 0 is missing a sentinel (NaN /
None) while a later band declares one. Bands with mixed concrete
sentinels are uncommon and would need an explicit ``nodata=``
argument anyway.
* ``attrs['_FillValue']`` -- CF-style xarray pipelines.
Returns ``None`` when none of the keys carry a usable value. NaN
entries in ``nodatavals`` are skipped rather than treated as a
sentinel (NaN means "the float NaN is the sentinel", which is
already the default and doesn't need a GDAL_NODATA tag).
"""
nodata = attrs.get('nodata')
if nodata is not None:
return nodata
vals = attrs.get('nodatavals')
if vals is not None:
try:
seq = list(vals)
except TypeError:
seq = [vals]
for v in seq:
if v is None:
continue
try:
fv = float(v)
except (TypeError, ValueError):
continue
if np.isnan(fv):
continue
return v
fill = attrs.get('_FillValue')
if fill is not None:
try:
ffv = float(fill)
except (TypeError, ValueError):
return fill # non-numeric -- pass through verbatim
if np.isnan(ffv):
return None
return fill
return None
def _merge_friendly_extra_tags(extra_tags_list, attrs: dict) -> list | None:
    """Fold the friendly tag attrs into ``attrs['extra_tags']``.

    Builds ``(tag_id, type_id, count, value)`` entries from
    ``attrs['image_description']`` (tag 270, ASCII),
    ``attrs['extra_samples']`` (tag 338, SHORT) and
    ``attrs['colormap']`` (tag 320, SHORT).  An entry already present
    in ``extra_tags`` takes precedence, so a verbatim round-trip stays
    byte-identical.
    """
    merged = list(extra_tags_list) if extra_tags_list else []
    present = {entry[0] for entry in merged}

    def _as_shorts(raw):
        # Best-effort coercion to a tuple of ints; None marks unusable input.
        try:
            return tuple(int(item) for item in raw)
        except (TypeError, ValueError):
            return None

    description = attrs.get('image_description')
    if description is not None and 270 not in present:
        text = str(description)
        # Count includes the trailing NUL per the TIFF ASCII convention.
        merged.append((270, _TIFF_ASCII, len(text) + 1, text))
        present.add(270)

    samples = attrs.get('extra_samples')
    if samples is not None and 338 not in present:
        shorts = _as_shorts(samples)
        if shorts:
            payload = shorts[0] if len(shorts) == 1 else shorts
            merged.append((338, _TIFF_SHORT, len(shorts), payload))
            present.add(338)

    cmap = attrs.get('colormap')
    if cmap is not None and 320 not in present:
        shorts = _as_shorts(cmap)
        if shorts:
            payload = shorts[0] if len(shorts) == 1 else shorts
            merged.append((320, _TIFF_SHORT, len(shorts), payload))
            present.add(320)

    return merged or None
# String identifiers (used in xrspatial attrs) -> TIFF ResolutionUnit tag ids.
# 1 = no absolute unit, 2 = inch, 3 = centimeter (TIFF tag 296 values).
_RESOLUTION_UNIT_IDS = {'none': 1, 'inch': 2, 'centimeter': 3}
def _extract_rich_tags(attrs: dict) -> dict:
    """Collect the rich-tag kwargs the writers forward to ``write(...)``.

    Shared bookkeeping for :func:`to_geotiff`, :func:`_write_vrt_tiled`,
    and :func:`write_geotiff_gpu`:

    * ``raster_type`` -- ``attrs['raster_type'] == 'point'`` maps to
      :data:`RASTER_PIXEL_IS_POINT`; anything else stays
      :data:`RASTER_PIXEL_IS_AREA`.
    * ``gdal_metadata_xml`` -- ``attrs['gdal_metadata_xml']`` wins;
      otherwise XML is synthesized from ``attrs['gdal_metadata']`` when
      that is a dict.
    * ``extra_tags`` -- ``attrs['extra_tags']`` merged with the friendly
      tag attrs (image_description / extra_samples / colormap) through
      :func:`_merge_friendly_extra_tags`.
    * ``x_resolution`` / ``y_resolution`` -- passed through verbatim.
    * ``resolution_unit`` -- string label translated to the integer tag
      id (unknown labels become None).

    The returned dict is ready to splat into ``write(...)``; every key
    matches the corresponding parameter of
    :func:`xrspatial.geotiff._writer.write`.
    """
    if attrs.get('raster_type') == 'point':
        raster_type = RASTER_PIXEL_IS_POINT
    else:
        raster_type = RASTER_PIXEL_IS_AREA

    metadata_xml = attrs.get('gdal_metadata_xml')
    if metadata_xml is None:
        meta_dict = attrs.get('gdal_metadata')
        if isinstance(meta_dict, dict):
            from ._geotags import _build_gdal_metadata_xml
            metadata_xml = _build_gdal_metadata_xml(meta_dict)

    merged_tags = _merge_friendly_extra_tags(attrs.get('extra_tags'), attrs)

    unit_label = attrs.get('resolution_unit')
    unit_id = (_RESOLUTION_UNIT_IDS.get(str(unit_label))
               if unit_label is not None else None)

    return {
        'raster_type': raster_type,
        'gdal_metadata_xml': metadata_xml,
        'extra_tags': merged_tags,
        'x_resolution': attrs.get('x_resolution'),
        'y_resolution': attrs.get('y_resolution'),
        'resolution_unit': unit_id,
    }
def to_geotiff(data: xr.DataArray | np.ndarray,
path: str | BinaryIO, *,
crs: int | str | None = None,
nodata=None,
compression: str = 'zstd',
compression_level: int | None = None,
tiled: bool = True,
tile_size: int = 256,
predictor: bool | int = False,
cog: bool = False,
overview_levels: list[int] | None = None,
overview_resampling: str = 'mean',
bigtiff: bool | None = None,
gpu: bool | None = None,
streaming_buffer_bytes: int = 256 * 1024 * 1024,
max_z_error: float = 0.0) -> None:
"""Write data as a GeoTIFF or Cloud Optimized GeoTIFF.
Dask-backed DataArrays are written in streaming mode: one tile-row
at a time, without materialising the full array into RAM. Peak
memory is roughly ``tile_size * width * bytes_per_sample``. COG
output (``cog=True``) still materialises because overviews need the
full array.
Automatically dispatches to GPU compression when:
- ``gpu=True`` is passed, or
- The input data is CuPy-backed (auto-detected)
GPU write uses nvCOMP batch compression (deflate/ZSTD) and keeps
the array on device. Falls back to CPU if nvCOMP is not available.
Parameters
----------
data : xr.DataArray or np.ndarray
2D raster data.
path : str or binary file-like
Output file path, or any object exposing a ``write`` method
(e.g. ``io.BytesIO``). When a file-like is passed, the encoded
TIFF bytes are written to that object once assembly completes.
``cog=True`` and ``.vrt`` outputs require a string path.
crs : int, str, or None
EPSG code (int), WKT string, or PROJ string. If None and data
is a DataArray, tries to read from attrs ('crs' for EPSG,
'crs_wkt' for WKT).
nodata : float, int, or None
NoData value.
compression : str
Codec name. One of ``'none'``, ``'deflate'``, ``'lzw'``,
``'jpeg'``, ``'packbits'``, ``'zstd'``, ``'lz4'``,
``'jpeg2000'`` (alias ``'j2k'``), or ``'lerc'``.
``'jpeg'`` is currently rejected on write because the encoder
omits the JPEGTables tag and produced files do not round-trip