diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index eecb4b04..0381aaff 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,5 +1,3 @@ module,last_inspected,issue,severity_max,categories_found,notes -geotiff,2026-05-11,1598,MEDIUM,4,"read_vrt(band=N) used vrt.bands[0].nodata for attrs and integer-promotion mask, mis-masking band N reads with per-band sentinels (#1598)." -geotiff,2026-05-11,1597,HIGH,4;5,read_geotiff_dask dropped nodata mask when sentinel only hit non-first chunks (per-chunk dtype divergence; #1597). +geotiff,2026-05-11,1616,HIGH,4,_vrt.read_vrt leaked integer source nodata sentinel through int->float cast when VRT declared a float dataType; downstream NaN-aware code saw sentinel as valid data (#1616). reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. -geotiff,2026-05-11,1606,MEDIUM,1;5,_write_vrt_tiled dropped nodatavals/_FillValue aliases plus gdal_metadata/extra_tags/x_resolution/y_resolution/resolution_unit/raster_type; backend-inconsistent with to_geotiff(.tif) and write_geotiff_gpu (#1606). diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 7df900f4..63bf60ab 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -372,6 +372,35 @@ def read_vrt(vrt_path: str, *, window=None, src_arr = src_arr.copy() sentinel = src_arr.dtype.type(src_nodata) src_arr[src_arr == sentinel] = np.nan + elif (src_nodata is not None + and src_arr.dtype.kind in ('u', 'i') + and result.dtype.kind == 'f'): + # Integer source feeding a float-dataType VRT. Without + # this branch the source's sentinel value (e.g. 65535 + # for uint16) flows through the int->float cast at the + # ``result[...] = src_arr[...]`` placement below as a + # literal float value, so downstream NaN-aware code + # sees the sentinel as valid data. Gate the cast on + # the sentinel being representable in the source dtype + # so out-of-range sentinels (e.g. uint16 file paired + # with GDAL_NODATA="-9999") stay no-op rather than + # tripping OverflowError on ``dtype.type(int(...))``. + # See issue #1616. + try: + nodata_f = float(src_nodata) + except (TypeError, ValueError): + nodata_f = None + if (nodata_f is not None + and np.isfinite(nodata_f) + and nodata_f.is_integer()): + info = np.iinfo(src_arr.dtype) + nodata_int = int(nodata_f) + if info.min <= nodata_int <= info.max: + sentinel = src_arr.dtype.type(nodata_int) + mask = src_arr == sentinel + if mask.any(): + src_arr = src_arr.astype(result.dtype) + src_arr[mask] = result.dtype.type('nan') # Apply ComplexSource scaling if src.scale is not None and src.scale != 1.0: diff --git a/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py b/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py new file mode 100644 index 00000000..4fa00555 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py @@ -0,0 +1,195 @@ +"""Regression tests for issue #1616. + +A VRT whose ```` (or Float64) is fed +by an integer source GeoTIFF with an in-range ``GDAL_NODATA`` sentinel +used to leak the sentinel value through as a literal float in the +returned array. ``attrs['nodata']`` advertised the sentinel but the +pixels at the sentinel positions still held the integer value cast to +float (e.g. ``65535.0`` rather than ``np.nan``). NaN-aware downstream +code therefore saw the sentinel as valid data. + +The fix masks integer source arrays before they're placed into a float +``result`` buffer in ``_vrt.read_vrt`` so a float-dtype VRT result lands +with NaN at the sentinel pixels, matching what ``open_geotiff`` returns +for a single-file integer raster with the same sentinel. +""" +from __future__ import annotations + +import numpy as np + +from xrspatial.geotiff import read_vrt +from xrspatial.geotiff._writer import write + + +def _write_uint16_with_sentinel(tmp_path, sentinel=65535, filename='b0.tif'): + band = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return p + + +def _write_int16_with_sentinel(tmp_path, sentinel=-1, filename='b0.tif'): + band = np.array([[1, 2], [3, sentinel]], dtype=np.int16) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return p + + +def _build_vrt(tmp_path, source_path, vrt_dtype, nodata_value, + filename='mismatch.vrt'): + """Hand-roll a VRT with the requested dataType / NoDataValue pair.""" + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {nodata_value} + + {source_path} + 1 + + + + +""" + p = str(tmp_path / filename) + with open(p, 'w') as f: + f.write(vrt_xml) + return p + + +def test_float32_vrt_uint16_source_masks_in_range_sentinel(tmp_path): + """Float32 VRT, uint16 source with in-range sentinel: pixel becomes NaN. + + Before the fix this returned dtype=float32 with values[1, 1] == 65535.0 + while ``attrs['nodata']`` advertised the sentinel. + """ + src = _write_uint16_with_sentinel(tmp_path) + vrt = _build_vrt(tmp_path, src, 'Float32', 65535) + r = read_vrt(vrt) + assert r.dtype == np.float32, ( + f"Float32-declared VRT should return float32, got {r.dtype}") + assert np.isnan(r.values[1, 1]), ( + "Sentinel pixel (uint16 65535 -> float32) should be NaN-masked; " + f"got values[1, 1]={r.values[1, 1]}") + assert r.attrs.get('nodata') == 65535.0 + assert r.values[0, 0] == 1.0 + + +def test_float64_vrt_int16_source_masks_negative_sentinel(tmp_path): + """Float64 VRT, int16 source with negative sentinel: pixel becomes NaN.""" + src = _write_int16_with_sentinel(tmp_path, sentinel=-1) + vrt = _build_vrt(tmp_path, src, 'Float64', -1) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert np.isnan(r.values[1, 1]), ( + f"Sentinel pixel (-1) should be NaN-masked; " + f"got values[1, 1]={r.values[1, 1]}") + assert r.attrs.get('nodata') == -1.0 + + +def test_float32_vrt_out_of_range_sentinel_is_noop(tmp_path): + """An out-of-range sentinel (e.g. uint16 source + NoDataValue=-9999) + stays unmasked rather than raising ``OverflowError`` from the + ``uint16(-9999)`` cast. The pixel data is returned as-is and + ``attrs['nodata']`` still surfaces the declared sentinel so callers + can mask in user code or write through. + """ + arr = np.array([[1, 2], [3, 4]], dtype=np.uint16) + p = str(tmp_path / 'b0_no_nodata.tif') + write(arr, p, compression='none', tiled=False) # no GDAL_NODATA on file + vrt = _build_vrt(tmp_path, p, 'Float32', -9999) + r = read_vrt(vrt) + assert r.dtype == np.float32 + # No pixels match the (out-of-range) sentinel, so nothing was masked. + assert not np.isnan(r.values).any() + assert r.attrs.get('nodata') == -9999.0 + + +def test_float32_vrt_uint16_source_no_sentinel_pixels(tmp_path): + """Float32 VRT, uint16 source whose pixels do not match the sentinel: + the result is a clean float array with no NaNs introduced. + + This exercises the early-out path inside the new mask branch -- a + declared sentinel that matches no pixels must not perturb the data + or cause an extra copy that would surface as a different dtype. + """ + arr = np.array([[1, 2], [3, 4]], dtype=np.uint16) + p = str(tmp_path / 'b0_clean.tif') + write(arr, p, compression='none', tiled=False) + vrt = _build_vrt(tmp_path, p, 'Float32', 65535) + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert not np.isnan(r.values).any() + np.testing.assert_array_equal(r.values, arr.astype(np.float32)) + + +def test_float_vrt_int_source_dask_path_masks_sentinel(tmp_path): + """The dask wrapper path (``chunks=...``) also returns NaN at the + sentinel pixel. The dask reader chunks the eager result after decode, + so the bug propagates if the eager path leaks the sentinel. + """ + src = _write_uint16_with_sentinel(tmp_path) + vrt = _build_vrt(tmp_path, src, 'Float32', 65535) + r = read_vrt(vrt, chunks=2) + # Dask path keeps the float32 dtype declared by the VRT. + assert r.dtype == np.float32 + val = r.values + assert np.isnan(val[1, 1]) + + +def test_float_vrt_int_source_round_trip_nodata_attr(tmp_path): + """Even though the masking promotes pixels to NaN, the + ``attrs['nodata']`` value still carries the original sentinel so a + downstream write can restore the literal sentinel byte pattern. + """ + src = _write_uint16_with_sentinel(tmp_path) + vrt = _build_vrt(tmp_path, src, 'Float32', 65535) + r = read_vrt(vrt) + assert r.attrs.get('nodata') == 65535.0 + + +def test_float_vrt_int_source_with_band_select(tmp_path): + """The band=N selection path also masks integer sentinels for a + float-declared VRT. The per-band ``NoDataValue`` from the VRT XML + must reach the source-side masking step, not just ``attrs['nodata']``. + """ + src_a = _write_uint16_with_sentinel(tmp_path, sentinel=65535, + filename='ba.tif') + src_b = _write_uint16_with_sentinel(tmp_path, sentinel=65000, + filename='bb.tif') + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + 65535 + + {src_a} + 1 + + + + + + 65000 + + {src_b} + 1 + + + + +""" + vrt_path = str(tmp_path / 'mb.vrt') + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + + # band 0 -> 65535 sentinel masked + r0 = read_vrt(vrt_path, band=0) + assert r0.dtype == np.float32 + assert np.isnan(r0.values[1, 1]) + assert r0.attrs.get('nodata') == 65535.0 + + # band 1 -> 65000 sentinel masked, not 65535 + r1 = read_vrt(vrt_path, band=1) + assert r1.dtype == np.float32 + # band b had its sentinel at the same [1, 1] cell + assert np.isnan(r1.values[1, 1]) + assert r1.attrs.get('nodata') == 65000.0