Skip to content

Commit 733db7d

Browse files
committed
Fix read_vrt leaking integer source nodata in float-dataType VRT (#1616)
A VRT declaring ``dataType="Float32"`` or ``"Float64"`` over integer source GeoTIFFs used to leak the GDAL_NODATA sentinel value as a literal float (e.g. 65535.0) into the returned array. ``attrs['nodata']`` advertised the sentinel but NaN-aware downstream code (``np.isnan``, ``xr.where``, slope/aspect, statistics) saw the sentinel pixels as valid data and produced wrong answers. The eager numpy, dask, and GPU paths all funnel through ``_vrt.read_vrt``, which only NaN-masked source arrays whose dtype was already float. Mask integer source arrays too when the result buffer is float, gated on the sentinel being representable in the source dtype so out-of-range values (uint16 file + ``NoDataValue=-9999``) stay no-op rather than tripping ``OverflowError``. Single-file integer rasters already get the sentinel-to-NaN promotion from the wrappers in ``__init__.py``; this fix brings the VRT path in line with that semantics for the int-source + float-dataType combo.
1 parent 0088dfe commit 733db7d

3 files changed

Lines changed: 226 additions & 3 deletions

File tree

.claude/sweep-metadata-state.csv

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
11
module,last_inspected,issue,severity_max,categories_found,notes
2-
geotiff,2026-05-11,1598,MEDIUM,4,"read_vrt(band=N) used vrt.bands[0].nodata for attrs and integer-promotion mask, mis-masking band N reads with per-band sentinels (#1598)."
3-
geotiff,2026-05-11,1597,HIGH,4;5,read_geotiff_dask dropped nodata mask when sentinel only hit non-first chunks (per-chunk dtype divergence; #1597).
2+
geotiff,2026-05-11,1616,HIGH,4,_vrt.read_vrt leaked integer source nodata sentinel through int->float cast when VRT declared a float dataType; downstream NaN-aware code saw sentinel as valid data (#1616).
43
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.
5-
geotiff,2026-05-11,1606,MEDIUM,1;5,_write_vrt_tiled dropped nodatavals/_FillValue aliases plus gdal_metadata/extra_tags/x_resolution/y_resolution/resolution_unit/raster_type; backend-inconsistent with to_geotiff(.tif) and write_geotiff_gpu (#1606).

xrspatial/geotiff/_vrt.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,35 @@ def read_vrt(vrt_path: str, *, window=None,
372372
src_arr = src_arr.copy()
373373
sentinel = src_arr.dtype.type(src_nodata)
374374
src_arr[src_arr == sentinel] = np.nan
375+
elif (src_nodata is not None
376+
and src_arr.dtype.kind in ('u', 'i')
377+
and result.dtype.kind == 'f'):
378+
# Integer source feeding a float-dataType VRT. Without
379+
# this branch the source's sentinel value (e.g. 65535
380+
# for uint16) flows through the int->float cast at the
381+
# ``result[...] = src_arr[...]`` placement below as a
382+
# literal float value, so downstream NaN-aware code
383+
# sees the sentinel as valid data. Gate the cast on
384+
# the sentinel being representable in the source dtype
385+
# so out-of-range sentinels (e.g. uint16 file paired
386+
# with GDAL_NODATA="-9999") stay no-op rather than
387+
# tripping OverflowError on ``dtype.type(int(...))``.
388+
# See issue #1616.
389+
try:
390+
nodata_f = float(src_nodata)
391+
except (TypeError, ValueError):
392+
nodata_f = None
393+
if (nodata_f is not None
394+
and np.isfinite(nodata_f)
395+
and nodata_f.is_integer()):
396+
info = np.iinfo(src_arr.dtype)
397+
nodata_int = int(nodata_f)
398+
if info.min <= nodata_int <= info.max:
399+
sentinel = src_arr.dtype.type(nodata_int)
400+
mask = src_arr == sentinel
401+
if mask.any():
402+
src_arr = src_arr.astype(result.dtype)
403+
src_arr[mask] = result.dtype.type('nan')
375404

376405
# Apply ComplexSource scaling
377406
if src.scale is not None and src.scale != 1.0:
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
"""Regression tests for issue #1616.
2+
3+
A VRT whose ``<VRTRasterBand dataType="Float32">`` (or Float64) is fed
4+
by an integer source GeoTIFF with an in-range ``GDAL_NODATA`` sentinel
5+
used to leak the sentinel value through as a literal float in the
6+
returned array. ``attrs['nodata']`` advertised the sentinel but the
7+
pixels at the sentinel positions still held the integer value cast to
8+
float (e.g. ``65535.0`` rather than ``np.nan``). NaN-aware downstream
9+
code therefore saw the sentinel as valid data.
10+
11+
The fix masks integer source arrays before they're placed into a float
12+
``result`` buffer in ``_vrt.read_vrt`` so a float-dtype VRT result lands
13+
with NaN at the sentinel pixels, matching what ``open_geotiff`` returns
14+
for a single-file integer raster with the same sentinel.
15+
"""
16+
from __future__ import annotations
17+
18+
import numpy as np
19+
import pytest
20+
21+
from xrspatial.geotiff import read_vrt
22+
from xrspatial.geotiff._writer import write
23+
24+
25+
def _write_uint16_with_sentinel(tmp_path, sentinel=65535, filename='b0.tif'):
26+
band = np.array([[1, 2], [3, sentinel]], dtype=np.uint16)
27+
p = str(tmp_path / filename)
28+
write(band, p, nodata=sentinel, compression='none', tiled=False)
29+
return p
30+
31+
32+
def _write_int16_with_sentinel(tmp_path, sentinel=-1, filename='b0.tif'):
33+
band = np.array([[1, 2], [3, sentinel]], dtype=np.int16)
34+
p = str(tmp_path / filename)
35+
write(band, p, nodata=sentinel, compression='none', tiled=False)
36+
return p
37+
38+
39+
def _build_vrt(tmp_path, source_path, vrt_dtype, nodata_value,
40+
filename='mismatch.vrt'):
41+
"""Hand-roll a VRT with the requested dataType / NoDataValue pair."""
42+
vrt_xml = f"""<VRTDataset rasterXSize="2" rasterYSize="2">
43+
<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>
44+
<VRTRasterBand dataType="{vrt_dtype}" band="1">
45+
<NoDataValue>{nodata_value}</NoDataValue>
46+
<SimpleSource>
47+
<SourceFilename relativeToVRT="0">{source_path}</SourceFilename>
48+
<SourceBand>1</SourceBand>
49+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
50+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
51+
</SimpleSource>
52+
</VRTRasterBand>
53+
</VRTDataset>"""
54+
p = str(tmp_path / filename)
55+
with open(p, 'w') as f:
56+
f.write(vrt_xml)
57+
return p
58+
59+
60+
def test_float32_vrt_uint16_source_masks_in_range_sentinel(tmp_path):
61+
"""Float32 VRT, uint16 source with in-range sentinel: pixel becomes NaN.
62+
63+
Before the fix this returned dtype=float32 with values[1, 1] == 65535.0
64+
while ``attrs['nodata']`` advertised the sentinel.
65+
"""
66+
src = _write_uint16_with_sentinel(tmp_path)
67+
vrt = _build_vrt(tmp_path, src, 'Float32', 65535)
68+
r = read_vrt(vrt)
69+
assert r.dtype == np.float32, (
70+
f"Float32-declared VRT should return float32, got {r.dtype}")
71+
assert np.isnan(r.values[1, 1]), (
72+
"Sentinel pixel (uint16 65535 -> float32) should be NaN-masked; "
73+
f"got values[1, 1]={r.values[1, 1]}")
74+
assert r.attrs.get('nodata') == 65535.0
75+
assert r.values[0, 0] == 1.0
76+
77+
78+
def test_float64_vrt_int16_source_masks_negative_sentinel(tmp_path):
79+
"""Float64 VRT, int16 source with negative sentinel: pixel becomes NaN."""
80+
src = _write_int16_with_sentinel(tmp_path, sentinel=-1)
81+
vrt = _build_vrt(tmp_path, src, 'Float64', -1)
82+
r = read_vrt(vrt)
83+
assert r.dtype == np.float64
84+
assert np.isnan(r.values[1, 1]), (
85+
f"Sentinel pixel (-1) should be NaN-masked; "
86+
f"got values[1, 1]={r.values[1, 1]}")
87+
assert r.attrs.get('nodata') == -1.0
88+
89+
90+
def test_float32_vrt_out_of_range_sentinel_is_noop(tmp_path):
91+
"""An out-of-range sentinel (e.g. uint16 source + NoDataValue=-9999)
92+
stays unmasked rather than raising ``OverflowError`` from the
93+
``uint16(-9999)`` cast. The pixel data is returned as-is and
94+
``attrs['nodata']`` still surfaces the declared sentinel so callers
95+
can mask in user code or write through.
96+
"""
97+
arr = np.array([[1, 2], [3, 4]], dtype=np.uint16)
98+
p = str(tmp_path / 'b0_no_nodata.tif')
99+
write(arr, p, compression='none', tiled=False) # no GDAL_NODATA on file
100+
vrt = _build_vrt(tmp_path, p, 'Float32', -9999)
101+
r = read_vrt(vrt)
102+
assert r.dtype == np.float32
103+
# No pixels match the (out-of-range) sentinel, so nothing was masked.
104+
assert not np.isnan(r.values).any()
105+
assert r.attrs.get('nodata') == -9999.0
106+
107+
108+
def test_float32_vrt_uint16_source_no_sentinel_pixels(tmp_path):
109+
"""Float32 VRT, uint16 source whose pixels do not match the sentinel:
110+
the result is a clean float array with no NaNs introduced.
111+
112+
This exercises the early-out path inside the new mask branch -- a
113+
declared sentinel that matches no pixels must not perturb the data
114+
or cause an extra copy that would surface as a different dtype.
115+
"""
116+
arr = np.array([[1, 2], [3, 4]], dtype=np.uint16)
117+
p = str(tmp_path / 'b0_clean.tif')
118+
write(arr, p, compression='none', tiled=False)
119+
vrt = _build_vrt(tmp_path, p, 'Float32', 65535)
120+
r = read_vrt(vrt)
121+
assert r.dtype == np.float32
122+
assert not np.isnan(r.values).any()
123+
np.testing.assert_array_equal(r.values, arr.astype(np.float32))
124+
125+
126+
def test_float_vrt_int_source_dask_path_masks_sentinel(tmp_path):
127+
"""The dask wrapper path (``chunks=...``) also returns NaN at the
128+
sentinel pixel. The dask reader chunks the eager result after decode,
129+
so the bug propagates if the eager path leaks the sentinel.
130+
"""
131+
src = _write_uint16_with_sentinel(tmp_path)
132+
vrt = _build_vrt(tmp_path, src, 'Float32', 65535)
133+
r = read_vrt(vrt, chunks=2)
134+
# Dask path keeps the float32 dtype declared by the VRT.
135+
assert r.dtype == np.float32
136+
val = r.values
137+
assert np.isnan(val[1, 1])
138+
139+
140+
def test_float_vrt_int_source_round_trip_nodata_attr(tmp_path):
141+
"""Even though the masking promotes pixels to NaN, the
142+
``attrs['nodata']`` value still carries the original sentinel so a
143+
downstream write can restore the literal sentinel byte pattern.
144+
"""
145+
src = _write_uint16_with_sentinel(tmp_path)
146+
vrt = _build_vrt(tmp_path, src, 'Float32', 65535)
147+
r = read_vrt(vrt)
148+
assert r.attrs.get('nodata') == 65535.0
149+
150+
151+
def test_float_vrt_int_source_with_band_select(tmp_path):
152+
"""The band=N selection path also masks integer sentinels for a
153+
float-declared VRT. The per-band ``NoDataValue`` from the VRT XML
154+
must reach the source-side masking step, not just ``attrs['nodata']``.
155+
"""
156+
src_a = _write_uint16_with_sentinel(tmp_path, sentinel=65535,
157+
filename='ba.tif')
158+
src_b = _write_uint16_with_sentinel(tmp_path, sentinel=65000,
159+
filename='bb.tif')
160+
vrt_xml = f"""<VRTDataset rasterXSize="2" rasterYSize="2">
161+
<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>
162+
<VRTRasterBand dataType="Float32" band="1">
163+
<NoDataValue>65535</NoDataValue>
164+
<SimpleSource>
165+
<SourceFilename relativeToVRT="0">{src_a}</SourceFilename>
166+
<SourceBand>1</SourceBand>
167+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
168+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
169+
</SimpleSource>
170+
</VRTRasterBand>
171+
<VRTRasterBand dataType="Float32" band="2">
172+
<NoDataValue>65000</NoDataValue>
173+
<SimpleSource>
174+
<SourceFilename relativeToVRT="0">{src_b}</SourceFilename>
175+
<SourceBand>1</SourceBand>
176+
<SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>
177+
<DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>
178+
</SimpleSource>
179+
</VRTRasterBand>
180+
</VRTDataset>"""
181+
vrt_path = str(tmp_path / 'mb.vrt')
182+
with open(vrt_path, 'w') as f:
183+
f.write(vrt_xml)
184+
185+
# band 0 -> 65535 sentinel masked
186+
r0 = read_vrt(vrt_path, band=0)
187+
assert r0.dtype == np.float32
188+
assert np.isnan(r0.values[1, 1])
189+
assert r0.attrs.get('nodata') == 65535.0
190+
191+
# band 1 -> 65000 sentinel masked, not 65535
192+
r1 = read_vrt(vrt_path, band=1)
193+
assert r1.dtype == np.float32
194+
# band b had its sentinel at the same [1, 1] cell
195+
assert np.isnan(r1.values[1, 1])
196+
assert r1.attrs.get('nodata') == 65000.0

0 commit comments

Comments
 (0)