Skip to content

Commit 105054d

Browse files
committed
Fix read_vrt(band=None) leaving bands 1+ sentinels unmasked (#1611)
read_vrt on a multi-band integer VRT with per-band <NoDataValue> tags applied vrt.bands[0].nodata across the full 3-D array. Bands 1+ kept their integer sentinels as literal finite values in the returned float64 array, breaking the convention that attrs['nodata'] is present iff sentinel pixels have already been NaN-masked. The float-VRT path masks per-band correctly inline in _vrt._read_data (lines 296-297 and 347-351). PR #1602 fixed the band=N single-band selection. This change extends the per-band masking to the band=None multi-band path: when ndim == 3 and band is None, walk vrt.bands and mask each arr[..., i] slice against its own <NoDataValue>. Masks are collected against the integer array first, then promotion to float64 happens once if any band actually hit. Gating reuses the same range check that PR #1583 added to other read paths (refactored into _sentinel_for_dtype): out-of-range, non-finite, and fractional sentinels become a no-op for value matching rather than raising OverflowError. attrs['nodata'] for band=None reads still carries band 0's sentinel - that's the documented "first band wins" contract for the canonical attr that cannot encode per-band values. 7 regression tests in test_vrt_multiband_int_nodata_1611.py cover the uint16 per-band case, int32 negative sentinels, mixed presence, no-sentinel-anywhere dtype preservation, out-of-range gating, the band=N non-regression, and the attrs contract. All 135 existing vrt/nodata geotiff tests still pass.
1 parent 7da3e0c commit 105054d

2 files changed

Lines changed: 284 additions & 14 deletions

File tree

xrspatial/geotiff/__init__.py

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2792,21 +2792,61 @@ def read_vrt(source: str, *, dtype=None, window=None,
27922792
# carried the literal sentinel value, breaking the convention that
27932793
# downstream code follows (``attrs['nodata']`` is present iff the
27942794
# array has already been NaN-masked).
2795-
if nodata is not None and arr.dtype.kind in ('u', 'i'):
2796-
# VRT nodata is parsed as float, so reject fractional, non-finite, or
2797-
# out-of-dtype-range values rather than truncate/wrap them into a
2798-
# sentinel that could mask real pixels.
2799-
nodata_f = float(nodata)
2800-
info = np.iinfo(arr.dtype)
2801-
if (
2802-
np.isfinite(nodata_f)
2803-
and nodata_f.is_integer()
2804-
and info.min <= nodata_f <= info.max
2805-
):
2806-
mask = arr == arr.dtype.type(int(nodata_f))
2807-
if mask.any():
2795+
#
2796+
# For multi-band reads (``band is None`` and ``arr.ndim == 3``), each
2797+
# band can declare its own ``<NoDataValue>``. The float-VRT path masks
2798+
# per-band inline in ``_vrt._read_data``; mirror that here by walking
2799+
# ``vrt.bands`` and masking each ``arr[..., i]`` slice against its own
2800+
# sentinel. Before this branch, only band 0's sentinel was applied and
2801+
# bands 1+ left their integer sentinels as literal finite values in
2802+
# the returned float64 array. See issue #1611.
2803+
def _sentinel_for_dtype(nodata_val, dtype):
2804+
"""Return ``dtype``-cast sentinel for ``nodata_val`` or ``None``
2805+
if the value can't be represented in ``dtype`` (non-integer
2806+
dtype, out-of-range, non-finite, or fractional). Mirrors the
2807+
gating PR #1583 added to other read paths via
2808+
``_int_nodata_in_range``.
2809+
"""
2810+
if nodata_val is None or dtype.kind not in ('u', 'i'):
2811+
return None
2812+
try:
2813+
nodata_f = float(nodata_val)
2814+
except (TypeError, ValueError):
2815+
return None
2816+
info = np.iinfo(dtype)
2817+
if not (np.isfinite(nodata_f) and nodata_f.is_integer()
2818+
and info.min <= nodata_f <= info.max):
2819+
return None
2820+
return dtype.type(int(nodata_f))
2821+
2822+
if arr.dtype.kind in ('u', 'i'):
2823+
if arr.ndim == 3 and band is None and vrt.bands:
2824+
# Per-band masking: compute every band's mask against the
2825+
# original integer array first, then promote once and write
2826+
# NaN into the float64 view. Doing the compare phase before
2827+
# any mutation avoids issues with mixed dtype views.
2828+
int_dtype = arr.dtype
2829+
band_masks = []
2830+
for i, vrt_band in enumerate(vrt.bands):
2831+
if i >= arr.shape[-1]:
2832+
break
2833+
sentinel = _sentinel_for_dtype(vrt_band.nodata, int_dtype)
2834+
if sentinel is None:
2835+
continue
2836+
mask = arr[..., i] == sentinel
2837+
if mask.any():
2838+
band_masks.append((i, mask))
2839+
if band_masks:
28082840
arr = arr.astype(np.float64)
2809-
arr[mask] = np.nan
2841+
for i, mask in band_masks:
2842+
arr[..., i][mask] = np.nan
2843+
elif nodata is not None:
2844+
sentinel = _sentinel_for_dtype(nodata, arr.dtype)
2845+
if sentinel is not None:
2846+
mask = arr == sentinel
2847+
if mask.any():
2848+
arr = arr.astype(np.float64)
2849+
arr[mask] = np.nan
28102850

28112851
# Surface the source GeoTransform in the same rasterio ordering used
28122852
# by open_geotiff: (pixel_width, 0, origin_x, 0, pixel_height, origin_y).
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
"""Regression tests for issue #1611.
2+
3+
``read_vrt(path)`` on a multi-band integer VRT with per-band
4+
``<NoDataValue>`` tags used to only mask band 0's sentinel. Bands 1
5+
and up retained their integer sentinels as literal finite values in
6+
the returned float64 array, breaking the convention that
7+
``attrs['nodata']`` is present iff sentinel pixels are already NaN.
8+
9+
The float-VRT path masks per-band correctly in ``_vrt._read_data``
10+
(lines 296-297, 347-351). For integer rasters the masking moved to
11+
``__init__.py:read_vrt`` and used ``vrt.bands[0].nodata`` for every
12+
band. The fix walks ``vrt.bands`` when ``band is None`` and masks
13+
each ``arr[..., i]`` slice against its own ``<NoDataValue>``.
14+
15+
This file mirrors test_vrt_band_nodata_1598.py for the multi-band
16+
``band=None`` path. PR #1602 fixed ``band=N`` single-band selection.
17+
"""
18+
from __future__ import annotations
19+
20+
import numpy as np
21+
import pytest
22+
23+
from xrspatial.geotiff import read_vrt
24+
from xrspatial.geotiff._writer import write
25+
26+
27+
def _write_two_band_per_band_nodata_vrt(tmp_path, *, dtype_str="UInt16",
28+
np_dtype=np.uint16,
29+
band0_sentinel=65535,
30+
band1_sentinel=65000,
31+
band0_other=(1, 2, 3),
32+
band1_other=(7, 8, 9)):
33+
"""Two single-band integer sources, each with a distinct nodata
34+
sentinel, exposed as bands 1 and 2 of a hand-rolled VRT.
35+
36+
Used to be band 0's sentinel was the only one masked. Now every
37+
band gets its own sentinel.
38+
"""
39+
b0_arr = np.array([[band0_other[0], band0_other[1]],
40+
[band0_other[2], band0_sentinel]], dtype=np_dtype)
41+
b1_arr = np.array([[band1_other[0], band1_other[1]],
42+
[band1_other[2], band1_sentinel]], dtype=np_dtype)
43+
p0 = str(tmp_path / 'vrt_b0_1611.tif')
44+
p1 = str(tmp_path / 'vrt_b1_1611.tif')
45+
write(b0_arr, p0, nodata=band0_sentinel, compression='none',
46+
tiled=False)
47+
write(b1_arr, p1, nodata=band1_sentinel, compression='none',
48+
tiled=False)
49+
50+
vrt_path = str(tmp_path / 'two_band_per_band_nodata_1611.vrt')
51+
vrt_xml = f"""<VRTDataset rasterXSize="2" rasterYSize="2">
52+
<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>
53+
<VRTRasterBand dataType="{dtype_str}" band="1">
54+
<NoDataValue>{band0_sentinel}</NoDataValue>
55+
<SimpleSource>
56+
<SourceFilename relativeToVRT="0">{p0}</SourceFilename>
57+
<SourceBand>1</SourceBand>
58+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
59+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
60+
</SimpleSource>
61+
</VRTRasterBand>
62+
<VRTRasterBand dataType="{dtype_str}" band="2">
63+
<NoDataValue>{band1_sentinel}</NoDataValue>
64+
<SimpleSource>
65+
<SourceFilename relativeToVRT="0">{p1}</SourceFilename>
66+
<SourceBand>1</SourceBand>
67+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
68+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
69+
</SimpleSource>
70+
</VRTRasterBand>
71+
</VRTDataset>"""
72+
with open(vrt_path, 'w') as f:
73+
f.write(vrt_xml)
74+
return vrt_path
75+
76+
77+
def test_multiband_uint16_per_band_sentinel_each_masked(tmp_path):
78+
"""The previously-broken case: every band's sentinel must be NaN.
79+
80+
Before the fix this returned dtype=float64 with band 0's (1,1) cell
81+
as NaN but band 1's (1,1) cell as the literal 65000.0.
82+
"""
83+
vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path)
84+
r = read_vrt(vrt_path)
85+
assert r.shape == (2, 2, 2)
86+
assert r.dtype == np.float64, (
87+
f"expected float64 promotion, got {r.dtype}"
88+
)
89+
assert np.isnan(r.values[1, 1, 0]), (
90+
"band 0's sentinel pixel was not NaN-masked."
91+
)
92+
assert np.isnan(r.values[1, 1, 1]), (
93+
"band 1's sentinel pixel was not NaN-masked; "
94+
"the regression from issue #1611 has returned."
95+
)
96+
# Non-sentinel pixels survive unchanged
97+
assert r.values[0, 0, 0] == 1
98+
assert r.values[0, 0, 1] == 7
99+
assert r.values[1, 0, 0] == 3
100+
assert r.values[1, 0, 1] == 9
101+
102+
103+
def test_multiband_int32_negative_per_band_sentinel(tmp_path):
104+
"""Negative sentinels in a signed integer VRT also mask per-band.
105+
106+
The original bug was dtype-independent: any integer dtype with
107+
per-band <NoDataValue> would have hit it. Cover int32 + negative
108+
sentinels to make sure the helper handles signed types and the
109+
range guard accepts negatives.
110+
"""
111+
vrt_path = _write_two_band_per_band_nodata_vrt(
112+
tmp_path, dtype_str="Int32", np_dtype=np.int32,
113+
band0_sentinel=-9999, band1_sentinel=-7777,
114+
band0_other=(10, 20, 30), band1_other=(40, 50, 60))
115+
r = read_vrt(vrt_path)
116+
assert r.dtype == np.float64
117+
assert np.isnan(r.values[1, 1, 0])
118+
assert np.isnan(r.values[1, 1, 1])
119+
assert r.values[0, 0, 0] == 10
120+
assert r.values[0, 0, 1] == 40
121+
122+
123+
def test_multiband_only_one_band_has_sentinel_present(tmp_path):
124+
"""If only one band's sentinel actually appears in the data, only
125+
that band should change. The non-hitting band stays the same float64
126+
value (no spurious NaN introduced).
127+
128+
Force band 1's sentinel never to appear by writing 99 instead.
129+
"""
130+
vrt_path = _write_two_band_per_band_nodata_vrt(
131+
tmp_path,
132+
band0_sentinel=65535, band1_sentinel=65000,
133+
band1_other=(7, 8, 9))
134+
# Overwrite the band 1 source so the sentinel value 65000 is NOT
135+
# present in band 1's actual data.
136+
b1_no_sentinel = np.array([[7, 8], [9, 99]], dtype=np.uint16)
137+
import os
138+
p1 = os.path.join(os.path.dirname(vrt_path), 'vrt_b1_1611.tif')
139+
write(b1_no_sentinel, p1, nodata=65000, compression='none',
140+
tiled=False)
141+
142+
r = read_vrt(vrt_path)
143+
assert r.dtype == np.float64, (
144+
"Even when only band 0 has a present sentinel, the array still "
145+
"needs promotion so band 0's NaN can be expressed."
146+
)
147+
assert np.isnan(r.values[1, 1, 0])
148+
assert r.values[1, 1, 1] == 99.0 # band 1 sentinel absent, no NaN
149+
150+
151+
def test_multiband_no_sentinel_present_anywhere_keeps_int_dtype(tmp_path):
152+
"""When no band actually contains its declared sentinel, skip
153+
promotion entirely. Avoids a needless float64 cast on integer data.
154+
"""
155+
vrt_path = _write_two_band_per_band_nodata_vrt(
156+
tmp_path,
157+
band0_sentinel=65535, band1_sentinel=65000,
158+
band0_other=(1, 2, 3), band1_other=(7, 8, 9))
159+
# Replace BOTH source files so neither contains its sentinel
160+
import os
161+
b0 = np.array([[1, 2], [3, 4]], dtype=np.uint16)
162+
b1 = np.array([[7, 8], [9, 10]], dtype=np.uint16)
163+
p0 = os.path.join(os.path.dirname(vrt_path), 'vrt_b0_1611.tif')
164+
p1 = os.path.join(os.path.dirname(vrt_path), 'vrt_b1_1611.tif')
165+
write(b0, p0, nodata=65535, compression='none', tiled=False)
166+
write(b1, p1, nodata=65000, compression='none', tiled=False)
167+
168+
r = read_vrt(vrt_path)
169+
# Sentinels not present -> integer dtype preserved (matches the
170+
# eager open_geotiff fast-path which also skips promotion when the
171+
# mask is empty).
172+
assert r.dtype == np.uint16
173+
assert r.values[1, 1, 0] == 4
174+
assert r.values[1, 1, 1] == 10
175+
176+
177+
def test_multiband_per_band_out_of_range_sentinel_is_no_op(tmp_path):
178+
"""A sentinel out of the integer dtype's range should be a no-op
179+
for that band rather than raising. Mirrors PR #1583's behaviour
180+
(#1581): the helper ``_int_nodata_in_range`` gates the cast.
181+
"""
182+
# uint16 cannot represent -9999; the helper should skip band 1.
183+
vrt_path = _write_two_band_per_band_nodata_vrt(
184+
tmp_path,
185+
dtype_str="UInt16", np_dtype=np.uint16,
186+
band0_sentinel=65535, band1_sentinel=10, # placeholder; rewrite below
187+
band0_other=(1, 2, 3), band1_other=(7, 8, 9))
188+
189+
# Hand-rewrite the VRT so band 1's NoDataValue is the out-of-range -9999.
190+
with open(vrt_path) as f:
191+
xml = f.read()
192+
xml = xml.replace("<NoDataValue>10</NoDataValue>",
193+
"<NoDataValue>-9999</NoDataValue>")
194+
with open(vrt_path, 'w') as f:
195+
f.write(xml)
196+
197+
# Should not raise and should still mask band 0.
198+
r = read_vrt(vrt_path)
199+
assert np.isnan(r.values[1, 1, 0]) # band 0 sentinel still masked
200+
# Band 1's sentinel (-9999) is out of uint16 range; the value 10
201+
# in band1[1,1] survives unchanged.
202+
assert r.values[1, 1, 1] == 10.0 or r.values[1, 1, 1] == 10
203+
204+
205+
def test_multiband_band_kwarg_still_per_band_post_pr1602(tmp_path):
206+
"""Non-regression check that PR #1602's band=N path still works.
207+
208+
The fix here only changes the ``band is None`` branch; ``band=N``
209+
must still route through the single-band masking with its own
210+
sentinel.
211+
"""
212+
vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path)
213+
r0 = read_vrt(vrt_path, band=0)
214+
r1 = read_vrt(vrt_path, band=1)
215+
assert r0.dtype == np.float64
216+
assert r1.dtype == np.float64
217+
assert r0.attrs.get('nodata') == 65535.0
218+
assert r1.attrs.get('nodata') == 65000.0
219+
assert np.isnan(r0.values[1, 1])
220+
assert np.isnan(r1.values[1, 1])
221+
222+
223+
def test_multiband_attrs_nodata_still_band0(tmp_path):
224+
"""``attrs['nodata']`` for band=None reads is documented as band
225+
0's sentinel (the canonical attr cannot encode per-band values).
226+
The pixel-level fix must not change that contract.
227+
"""
228+
vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path)
229+
r = read_vrt(vrt_path)
230+
assert r.attrs.get('nodata') == 65535.0

0 commit comments

Comments
 (0)