Skip to content

Commit 5c6d2ec

Browse files
committed
geotiff: activate mixed-band-metadata fail-closed check (#1987 PR 5)
Registers ``_check_read_mixed_band_metadata`` so VRT reads whose bands declare disagreeing per-band ``<NoDataValue>`` sentinels raise ``MixedBandMetadataError`` by default. The check and its ``band_nodata=`` opt-out were staged in #2031 but not registered: the activation needed the existing VRT test sites that mosaic distinct per-band sentinels to migrate to ``band_nodata='first'`` first. Wires the opt-out into ``open_geotiff`` and ``read_geotiff_dask`` so all three VRT entry points expose the same kwarg, with a clear ValueError on non-VRT sources (same dispatch-kwarg guard pattern as ``missing_sources`` / ``on_gpu_failure`` / ``max_cloud_bytes``). Migrates the four VRT test files whose fixtures intentionally mosaic bands with distinct sentinels: - ``test_vrt_band_nodata_1598.py`` - ``test_vrt_multiband_int_nodata_1611.py`` - ``test_vrt_int_source_float_dtype_1616.py`` - ``test_vrt_multiband_dtype_1696.py`` Each migrated call adds ``band_nodata='first'`` so the regression keeps asserting the legacy flatten-to-first-band semantics, now via the documented opt-out instead of implicitly. Adds ``test_mixed_band_metadata_fail_closed_1987.py`` with 11 cases pinning the fail-closed contract end-to-end: default raise, chunked raise, opt-out passthrough, shared-sentinel acceptance, one-band-only acceptance, ``open_geotiff`` / ``read_geotiff_dask`` propagation, and the non-VRT-source rejection on both entry points. Refs #1987.
1 parent 23ec22a commit 5c6d2ec

10 files changed

Lines changed: 367 additions & 27 deletions

xrspatial/geotiff/__init__.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ def open_geotiff(source: str | BinaryIO, *,
254254
missing_sources: str = _MISSING_SOURCES_SENTINEL,
255255
allow_rotated: bool = False,
256256
allow_unparseable_crs: bool = False,
257+
band_nodata: str | None = None,
257258
mask_nodata: bool = True,
258259
) -> xr.DataArray:
259260
"""Read a GeoTIFF, COG, or VRT file into an xarray.DataArray.
@@ -325,6 +326,17 @@ def open_geotiff(source: str | BinaryIO, *,
325326
return a partial mosaic. Passing this kwarg with a non-VRT
326327
source raises ``ValueError`` because the policy only applies to
327328
the VRT pipeline. See ``read_vrt`` for the full description.
329+
band_nodata : {'first', None}, optional
330+
VRT-only. Opt-out for the fail-closed check that rejects VRT
331+
sources whose bands declare disagreeing per-band nodata
332+
sentinels (issue #1987 PR 5). When ``None`` (the default), a VRT
333+
that mosaics bands with different sentinels raises
334+
``MixedBandMetadataError``; flattening to one value would let
335+
one band's valid pixels collide with another band's sentinel.
336+
Pass ``band_nodata='first'`` to keep the legacy behaviour of
337+
using band 0's sentinel for the whole mosaic. Passing this
338+
kwarg with a non-VRT source raises ``ValueError`` because the
339+
policy only applies to the VRT pipeline.
328340
mask_nodata : bool, default True
329341
If True (the default), replace the nodata sentinel with ``NaN``;
330342
integer rasters get promoted to ``float64`` first so NaN can be
@@ -409,6 +421,19 @@ def open_geotiff(source: str | BinaryIO, *,
409421
"Pass a .vrt path to enable the VRT pipeline, or drop "
410422
"missing_sources to keep the default GeoTIFF path.")
411423

424+
# ``band_nodata`` is the #1987 PR 5 opt-out for the mixed-band
425+
# metadata fail-closed check. It only has meaning on the VRT pipeline
426+
# (a plain GeoTIFF has one nodata sentinel per file, not per band),
427+
# so reject the kwarg up front on non-VRT sources rather than letting
428+
# it leak into ``read_vrt`` and confuse the caller about what the
429+
# opt-out actually controls.
430+
if band_nodata is not None and not _is_vrt_source:
431+
raise ValueError(
432+
"band_nodata only applies to VRT sources. "
433+
"Pass a .vrt path to enable the VRT pipeline, or drop "
434+
"band_nodata to keep the default GeoTIFF path. "
435+
"See issue #1987.")
436+
412437
# ``max_cloud_bytes`` is the eager fsspec-read budget. Only
413438
# ``_read_to_array`` on the eager non-VRT, non-GPU, non-dask branch
414439
# consumes it; the GPU (``read_geotiff_gpu``), dask
@@ -473,6 +498,7 @@ def open_geotiff(source: str | BinaryIO, *,
473498
max_pixels=max_pixels,
474499
allow_rotated=allow_rotated,
475500
allow_unparseable_crs=allow_unparseable_crs,
501+
band_nodata=band_nodata,
476502
mask_nodata=mask_nodata,
477503
**vrt_kwargs)
478504

xrspatial/geotiff/_backends/dask.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def read_geotiff_dask(source: str, *,
4141
max_pixels: int | None = None,
4242
allow_rotated: bool = False,
4343
allow_unparseable_crs: bool = False,
44+
band_nodata: str | None = None,
4445
mask_nodata: bool = True) -> xr.DataArray:
4546
"""Read a GeoTIFF as a dask-backed DataArray for out-of-core processing.
4647
@@ -74,6 +75,11 @@ def read_geotiff_dask(source: str, *,
7475
directly.
7576
name : str or None
7677
Name for the DataArray.
78+
band_nodata : {'first', None}, optional
79+
VRT-only opt-out for the fail-closed mixed-band-metadata check
80+
(issue #1987 PR 5). Forwarded verbatim to ``read_vrt`` when the
81+
source is a ``.vrt`` file. Passing it with a non-VRT GeoTIFF
82+
source raises ``ValueError``.
7783
mask_nodata : bool, default True
7884
If True, replace the nodata sentinel with NaN per chunk (integer
7985
rasters get promoted to ``float64``). If False, skip the
@@ -114,8 +120,21 @@ def read_geotiff_dask(source: str, *,
114120
return read_vrt(
115121
source, dtype=dtype, window=window, band=band, name=name,
116122
chunks=chunks, max_pixels=max_pixels,
123+
allow_rotated=allow_rotated,
124+
allow_unparseable_crs=allow_unparseable_crs,
125+
band_nodata=band_nodata,
117126
mask_nodata=mask_nodata,
118127
)
128+
# ``band_nodata`` only has meaning for the VRT path (per-band sentinel
129+
# ambiguity). Reject the kwarg up front on non-VRT GeoTIFF inputs so
130+
# callers learn the opt-out is being dropped, matching the
131+
# ``open_geotiff`` guard. See issue #1987 PR 5.
132+
if band_nodata is not None:
133+
raise ValueError(
134+
"band_nodata only applies to VRT sources. "
135+
"Pass a .vrt path to enable the VRT pipeline, or drop "
136+
"band_nodata to keep the default GeoTIFF path. "
137+
"See issue #1987.")
119138

120139
# P5: HTTP COG sources used to fire one IFD/header GET per chunk
121140
# task. Parse metadata once here so every delayed task can reuse it.

xrspatial/geotiff/_validation.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -950,11 +950,13 @@ def _same_nodata(a: float, b: float) -> bool:
950950
return a == b
951951

952952

953-
# NOT registered by default. The mixed-band check has a much larger
954-
# migration cost than its sibling read-side checks (around 35 existing
955-
# test sites would need to opt in via ``band_nodata='first'`` to keep
956-
# their legacy assertions), so it lands as a follow-up PR that bundles
957-
# the check activation with the test migration. The check itself and
958-
# the ``band_nodata=`` VRT kwarg ship now so the follow-up is a
959-
# one-line registration call plus the test sweep.
960-
# register_read_metadata_check(_check_read_mixed_band_metadata)
953+
# Registered as of issue #1987 PR 5. The check was staged in the
954+
# preceding bundle (#2031) along with the ``band_nodata=`` VRT kwarg
955+
# but not registered, because activation needed a coordinated migration
956+
# of the VRT test sites that read fixtures with disagreeing per-band
957+
# sentinels. The migration sweep ships alongside this registration.
958+
# Callers that still want the legacy flatten-to-first-band behaviour
959+
# pass ``band_nodata='first'`` to ``read_vrt`` / ``open_geotiff`` /
960+
# ``read_geotiff_dask``; the explicit opt-in surfaces the per-band
961+
# ambiguity at the call site instead of papering over it silently.
962+
register_read_metadata_check(_check_read_mixed_band_metadata)
Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
"""Fail-closed coverage for ``MixedBandMetadataError`` (issue #1987 PR 5).
2+
3+
A VRT can mosaic source bands that declare disagreeing per-band
4+
``<NoDataValue>`` sentinels. The legacy reader picked band 0's sentinel
5+
for the whole mosaic, which let band N's valid pixels collide with
6+
band 0's sentinel after the flatten-to-scalar step. The fail-closed
7+
default refuses the ambiguity; ``band_nodata='first'`` keeps the legacy
8+
behaviour explicitly.
9+
10+
The validator check itself, the ``band_nodata=`` kwarg on
11+
``read_vrt``, and the per-VRT-read context plumbing landed in
12+
the #2031 bundle. This PR registers the check and forwards
13+
``band_nodata`` through ``open_geotiff`` and ``read_geotiff_dask``.
14+
The tests below pin the four entry-point contracts:
15+
16+
* default ``read_vrt`` raises on disagreeing per-band sentinels
17+
* default ``read_vrt(chunks=...)`` raises before any task fires
18+
* ``band_nodata='first'`` opts back into the legacy reader
19+
* ``band_nodata`` raises on non-VRT sources for ``open_geotiff`` and
20+
``read_geotiff_dask``
21+
"""
22+
from __future__ import annotations
23+
24+
import os
25+
26+
import numpy as np
27+
import pytest
28+
29+
from xrspatial.geotiff import (
30+
GeoTIFFAmbiguousMetadataError,
31+
MixedBandMetadataError,
32+
open_geotiff,
33+
read_geotiff_dask,
34+
read_vrt,
35+
to_geotiff,
36+
)
37+
from xrspatial.geotiff._writer import write
38+
39+
40+
def _write_mixed_band_vrt(tmp_path, *, sentinel_a=65535, sentinel_b=65000):
41+
"""Two uint16 sources with distinct sentinels, mosaiced into one VRT.
42+
43+
``[1, 1]`` is the sentinel pixel in each source so the legacy
44+
reader's flatten-to-first-band step would mis-mask band 1.
45+
"""
46+
band0 = np.array([[1, 2], [3, sentinel_a]], dtype=np.uint16)
47+
band1 = np.array([[7, 8], [9, sentinel_b]], dtype=np.uint16)
48+
p0 = str(tmp_path / 'mixed_band_1987_a.tif')
49+
p1 = str(tmp_path / 'mixed_band_1987_b.tif')
50+
write(band0, p0, nodata=sentinel_a, compression='none', tiled=False)
51+
write(band1, p1, nodata=sentinel_b, compression='none', tiled=False)
52+
vrt_path = str(tmp_path / 'mixed_band_1987.vrt')
53+
vrt_xml = f"""<VRTDataset rasterXSize="2" rasterYSize="2">
54+
<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>
55+
<VRTRasterBand dataType="UInt16" band="1">
56+
<NoDataValue>{sentinel_a}</NoDataValue>
57+
<SimpleSource>
58+
<SourceFilename relativeToVRT="0">{p0}</SourceFilename>
59+
<SourceBand>1</SourceBand>
60+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
61+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
62+
</SimpleSource>
63+
</VRTRasterBand>
64+
<VRTRasterBand dataType="UInt16" band="2">
65+
<NoDataValue>{sentinel_b}</NoDataValue>
66+
<SimpleSource>
67+
<SourceFilename relativeToVRT="0">{p1}</SourceFilename>
68+
<SourceBand>1</SourceBand>
69+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
70+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
71+
</SimpleSource>
72+
</VRTRasterBand>
73+
</VRTDataset>"""
74+
with open(vrt_path, 'w') as f:
75+
f.write(vrt_xml)
76+
return vrt_path
77+
78+
79+
def _write_shared_sentinel_vrt(tmp_path, *, sentinel=65535):
80+
"""Two uint16 sources mosaiced with the *same* per-band sentinel.
81+
82+
The fail-closed check must not trigger on a VRT whose per-band
83+
nodata agrees; the check is about ambiguity, not about per-band
84+
declarations.
85+
"""
86+
band0 = np.array([[1, 2], [3, sentinel]], dtype=np.uint16)
87+
band1 = np.array([[7, 8], [9, sentinel]], dtype=np.uint16)
88+
p0 = str(tmp_path / 'shared_band_1987_a.tif')
89+
p1 = str(tmp_path / 'shared_band_1987_b.tif')
90+
write(band0, p0, nodata=sentinel, compression='none', tiled=False)
91+
write(band1, p1, nodata=sentinel, compression='none', tiled=False)
92+
vrt_path = str(tmp_path / 'shared_band_1987.vrt')
93+
vrt_xml = f"""<VRTDataset rasterXSize="2" rasterYSize="2">
94+
<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>
95+
<VRTRasterBand dataType="UInt16" band="1">
96+
<NoDataValue>{sentinel}</NoDataValue>
97+
<SimpleSource>
98+
<SourceFilename relativeToVRT="0">{p0}</SourceFilename>
99+
<SourceBand>1</SourceBand>
100+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
101+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
102+
</SimpleSource>
103+
</VRTRasterBand>
104+
<VRTRasterBand dataType="UInt16" band="2">
105+
<NoDataValue>{sentinel}</NoDataValue>
106+
<SimpleSource>
107+
<SourceFilename relativeToVRT="0">{p1}</SourceFilename>
108+
<SourceBand>1</SourceBand>
109+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
110+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
111+
</SimpleSource>
112+
</VRTRasterBand>
113+
</VRTDataset>"""
114+
with open(vrt_path, 'w') as f:
115+
f.write(vrt_xml)
116+
return vrt_path
117+
118+
119+
def _write_one_band_no_sentinel_vrt(tmp_path, *, sentinel=65535):
120+
"""One band declares a sentinel, the other does not.
121+
122+
``[b.nodata for b in vrt.bands]`` is ``[sentinel, None]``. The
123+
check considers ``None`` entries "no declaration" rather than a
124+
distinct sentinel, so only one concrete value is present and the
125+
check must accept.
126+
"""
127+
band0 = np.array([[1, 2], [3, sentinel]], dtype=np.uint16)
128+
band1 = np.array([[7, 8], [9, 99]], dtype=np.uint16)
129+
p0 = str(tmp_path / 'one_band_sentinel_a.tif')
130+
p1 = str(tmp_path / 'one_band_sentinel_b.tif')
131+
write(band0, p0, nodata=sentinel, compression='none', tiled=False)
132+
write(band1, p1, nodata=None, compression='none', tiled=False)
133+
vrt_path = str(tmp_path / 'one_band_sentinel.vrt')
134+
vrt_xml = f"""<VRTDataset rasterXSize="2" rasterYSize="2">
135+
<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>
136+
<VRTRasterBand dataType="UInt16" band="1">
137+
<NoDataValue>{sentinel}</NoDataValue>
138+
<SimpleSource>
139+
<SourceFilename relativeToVRT="0">{p0}</SourceFilename>
140+
<SourceBand>1</SourceBand>
141+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
142+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
143+
</SimpleSource>
144+
</VRTRasterBand>
145+
<VRTRasterBand dataType="UInt16" band="2">
146+
<SimpleSource>
147+
<SourceFilename relativeToVRT="0">{p1}</SourceFilename>
148+
<SourceBand>1</SourceBand>
149+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
150+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
151+
</SimpleSource>
152+
</VRTRasterBand>
153+
</VRTDataset>"""
154+
with open(vrt_path, 'w') as f:
155+
f.write(vrt_xml)
156+
return vrt_path
157+
158+
159+
def test_read_vrt_rejects_mixed_per_band_nodata(tmp_path):
160+
"""Default ``read_vrt`` raises on disagreeing per-band sentinels."""
161+
vrt_path = _write_mixed_band_vrt(tmp_path)
162+
with pytest.raises(MixedBandMetadataError) as exc_info:
163+
read_vrt(vrt_path)
164+
msg = str(exc_info.value)
165+
assert "65535" in msg and "65000" in msg
166+
assert "band_nodata='first'" in msg
167+
assert "#1987" in msg
168+
169+
170+
def test_read_vrt_chunked_rejects_mixed_per_band_nodata(tmp_path):
171+
"""The chunked dispatch validates before scheduling any task."""
172+
vrt_path = _write_mixed_band_vrt(tmp_path)
173+
with pytest.raises(MixedBandMetadataError):
174+
read_vrt(vrt_path, chunks=1)
175+
176+
177+
def test_read_vrt_band_nodata_first_opts_back_in(tmp_path):
178+
"""``band_nodata='first'`` keeps the legacy flatten-to-first path."""
179+
vrt_path = _write_mixed_band_vrt(tmp_path)
180+
r = read_vrt(vrt_path, band_nodata='first')
181+
# Legacy semantics: band 0's sentinel becomes ``attrs['nodata']``.
182+
assert r.attrs.get('nodata') == 65535.0
183+
184+
185+
def test_read_vrt_shared_sentinel_accepts(tmp_path):
186+
"""A VRT whose bands agree on a single sentinel is not ambiguous."""
187+
vrt_path = _write_shared_sentinel_vrt(tmp_path)
188+
r = read_vrt(vrt_path)
189+
assert r.attrs.get('nodata') == 65535.0
190+
191+
192+
def test_read_vrt_only_one_band_declares_sentinel_accepts(tmp_path):
193+
"""``None`` per-band entries count as "no declaration", not a value.
194+
195+
A VRT with one declared sentinel and one undeclared band has only
196+
one concrete sentinel so the fail-closed check must accept.
197+
"""
198+
vrt_path = _write_one_band_no_sentinel_vrt(tmp_path)
199+
# Should not raise.
200+
read_vrt(vrt_path)
201+
202+
203+
def test_open_geotiff_propagates_mixed_band_rejection(tmp_path):
204+
"""``open_geotiff`` on a ``.vrt`` source routes through ``read_vrt``."""
205+
vrt_path = _write_mixed_band_vrt(tmp_path)
206+
with pytest.raises(MixedBandMetadataError):
207+
open_geotiff(vrt_path)
208+
209+
210+
def test_open_geotiff_band_nodata_first_passes_through(tmp_path):
211+
"""``open_geotiff(band_nodata='first')`` forwards to ``read_vrt``."""
212+
vrt_path = _write_mixed_band_vrt(tmp_path)
213+
r = open_geotiff(vrt_path, band_nodata='first')
214+
assert r.attrs.get('nodata') == 65535.0
215+
216+
217+
def test_read_geotiff_dask_band_nodata_first_passes_through(tmp_path):
218+
"""``read_geotiff_dask`` on a ``.vrt`` source forwards the kwarg."""
219+
vrt_path = _write_mixed_band_vrt(tmp_path)
220+
r = read_geotiff_dask(vrt_path, chunks=1, band_nodata='first')
221+
assert r.attrs.get('nodata') == 65535.0
222+
223+
224+
def test_open_geotiff_band_nodata_rejected_on_non_vrt_source(tmp_path):
225+
"""``band_nodata`` is VRT-only; reject up front on plain GeoTIFFs.
226+
227+
Same dispatch-kwarg pattern as ``missing_sources``, ``on_gpu_failure``,
228+
and ``max_cloud_bytes``: silently dropping a backend-specific kwarg
229+
is the bug class #1561 / #1605 / #1685 / #1795 / #1810 / #1974 closed
230+
on the rest of the surface.
231+
"""
232+
arr = np.zeros((2, 2), dtype=np.uint16)
233+
arr_da = _wrap_2d(arr)
234+
p = tmp_path / 'plain.tif'
235+
to_geotiff(arr_da, str(p), compression='none', tiled=False)
236+
with pytest.raises(ValueError, match="band_nodata only applies to VRT"):
237+
open_geotiff(str(p), band_nodata='first')
238+
239+
240+
def test_read_geotiff_dask_band_nodata_rejected_on_non_vrt_source(tmp_path):
241+
"""``read_geotiff_dask`` matches ``open_geotiff``'s VRT-only guard."""
242+
arr = np.zeros((2, 2), dtype=np.uint16)
243+
arr_da = _wrap_2d(arr)
244+
p = tmp_path / 'plain.tif'
245+
to_geotiff(arr_da, str(p), compression='none', tiled=False)
246+
with pytest.raises(ValueError, match="band_nodata only applies to VRT"):
247+
read_geotiff_dask(str(p), chunks=1, band_nodata='first')
248+
249+
250+
def test_mixed_band_metadata_error_subclasses_base(tmp_path):
251+
"""``MixedBandMetadataError`` is catchable via the base class."""
252+
vrt_path = _write_mixed_band_vrt(tmp_path)
253+
with pytest.raises(GeoTIFFAmbiguousMetadataError):
254+
read_vrt(vrt_path)
255+
256+
257+
def _wrap_2d(arr):
258+
"""Wrap a 2D numpy array as a minimal DataArray for ``to_geotiff``."""
259+
import xarray as xr
260+
h, w = arr.shape
261+
return xr.DataArray(
262+
arr,
263+
dims=('y', 'x'),
264+
coords={
265+
'y': np.arange(h, dtype=np.float64),
266+
'x': np.arange(w, dtype=np.float64),
267+
},
268+
attrs={'crs': 4326,
269+
'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 0.0)},
270+
)

xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"missing_sources",
4545
"allow_rotated",
4646
"allow_unparseable_crs",
47+
"band_nodata",
4748
"mask_nodata",
4849
)
4950

0 commit comments

Comments
 (0)