Skip to content

Commit 0732617

Browse files
committed
Fix read_geotiff_dask losing nodata mask in non-first chunks (#1597)
For an integer raster paired with an in-range nodata sentinel, read_geotiff_dask declares the output dtype as float64 but per-chunk _delayed_read_window only promoted a chunk to float64 when that chunk's mask actually hit. With the sentinel in a non-first chunk the top-left chunk stayed uint16, and dask concatenation preallocated the result from the first chunk's dtype -- silently casting later float64 chunks back to uint16 and converting their NaNs to 0. Always thread the resolved target_dtype through _delayed_read_window so every chunk lands as float64. Out-of-range sentinels still keep the file's native dtype because effective_dtype already short-circuits that case earlier in read_geotiff_dask. Tests cover the sentinel-in-corner regression, per-chunk dtype uniformity, the out-of-range #1581 path, and float-raster parity.
1 parent c5e873d commit 0732617

3 files changed

Lines changed: 128 additions & 2 deletions

File tree

.claude/sweep-metadata-state.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
module,last_inspected,issue,severity_max,categories_found,notes
2-
geotiff,2026-05-11,1580;1582,HIGH,1;3;4;5,"write_geotiff_gpu silently mis-ordered (band, y, x) DataArrays (#1580). to_geotiff ignored rioxarray nodatavals and CF _FillValue attrs (#1582). Both fixed in same branch."
2+
geotiff,2026-05-11,1597,HIGH,4;5,read_geotiff_dask dropped nodata mask when sentinel only hit non-first chunks (per-chunk dtype divergence; #1597).
33
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.

xrspatial/geotiff/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1641,13 +1641,23 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512,
16411641
# Translate window-relative chunk coords back to file-relative
16421642
# coords for ``read_to_array``. ``win_r0`` / ``win_c0`` are 0
16431643
# when no window was requested.
1644+
# Always thread ``target_dtype`` so each delayed chunk lands
1645+
# in the same dtype that the dask array declared. Without
1646+
# this, an integer raster with an in-range nodata sentinel
1647+
# would have ``effective_dtype=float64`` declared on the
1648+
# dask graph but per-chunk arrays only promoted when a
1649+
# chunk happened to contain a sentinel pixel. Dask
1650+
# concatenation then preallocates from the first chunk's
1651+
# actual dtype (uint16), silently casting later float64
1652+
# chunks back to int and converting their NaNs to 0. See
1653+
# issue #1597.
16441654
block = da.from_delayed(
16451655
_delayed_read_window(source,
16461656
r0 + win_r0, c0 + win_c0,
16471657
r1 + win_r0, c1 + win_c0,
16481658
overview_level, nodata,
16491659
band_arg,
1650-
target_dtype=target_dtype if dtype is not None else None,
1660+
target_dtype=target_dtype,
16511661
http_meta_key=http_meta_key,
16521662
max_pixels=max_pixels),
16531663
shape=block_shape,
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""Regression tests for issue #1597.
2+
3+
``read_geotiff_dask`` on an integer raster with an in-range nodata
4+
sentinel used to silently lose the mask when the sentinel only appeared
5+
in non-first chunks. Per-chunk dtype divergence (uint16 vs float64)
6+
caused dask concatenation to preallocate from the first chunk's actual
7+
dtype, casting float64 chunks back to int and converting NaN to 0.
8+
9+
The fix threads the resolved ``target_dtype`` (the dask graph's
10+
declared dtype) unconditionally through ``_delayed_read_window`` so
11+
every chunk lands as float64 regardless of whether its mask hit.
12+
"""
13+
from __future__ import annotations
14+
15+
import numpy as np
16+
import pytest
17+
18+
from xrspatial.geotiff import open_geotiff
19+
from xrspatial.geotiff._writer import write
20+
21+
22+
@pytest.fixture
23+
def uint16_with_sentinel_only_in_corner(tmp_path):
24+
"""Write a uint16 8x8 TIFF whose nodata sentinel is in the
25+
bottom-right 2x2 quadrant. With ``chunks=4`` the top-left chunk
26+
never sees a sentinel and used to keep its uint16 dtype.
27+
"""
28+
arr = np.arange(64, dtype=np.uint16).reshape(8, 8) + 1
29+
arr[6:8, 6:8] = 65535
30+
path = str(tmp_path / 'uint16_corner_sentinel_1597.tif')
31+
write(arr, path, nodata=65535, compression='none', tiled=False)
32+
return path, arr
33+
34+
35+
def test_eager_promotes_to_float64_and_masks(uint16_with_sentinel_only_in_corner):
36+
"""Baseline: the eager path produces float64 with 4 NaNs."""
37+
path, _ = uint16_with_sentinel_only_in_corner
38+
eager = open_geotiff(path)
39+
assert eager.dtype == np.float64
40+
assert np.isnan(eager.values).sum() == 4
41+
assert np.isnan(eager.values[6:8, 6:8]).all()
42+
43+
44+
def test_dask_chunks_4_matches_eager(uint16_with_sentinel_only_in_corner):
45+
"""The dask compute result matches the eager path bit-for-bit.
46+
47+
Before the fix this returned a uint16 array with 0s where the
48+
sentinel had been, because dask coerced the late-arriving float64
49+
chunk back to uint16 at concat time.
50+
"""
51+
path, _ = uint16_with_sentinel_only_in_corner
52+
eager = open_geotiff(path)
53+
dk = open_geotiff(path, chunks=4)
54+
assert dk.dtype == np.float64
55+
computed = dk.compute()
56+
assert computed.dtype == np.float64
57+
np.testing.assert_array_equal(np.isnan(computed.values),
58+
np.isnan(eager.values))
59+
finite = ~np.isnan(eager.values)
60+
np.testing.assert_array_equal(computed.values[finite],
61+
eager.values[finite])
62+
63+
64+
def test_dask_chunks_2_per_chunk_dtype_uniform(
65+
uint16_with_sentinel_only_in_corner):
66+
"""Every dask chunk returns float64 regardless of mask hit.
67+
68+
Iterates the delayed blocks and asserts each one computes to
69+
float64; the regression had the first chunk's actual data come back
70+
as uint16 because the mask never matched there.
71+
"""
72+
path, _ = uint16_with_sentinel_only_in_corner
73+
dk = open_geotiff(path, chunks=2)
74+
blocks = dk.data.to_delayed().flatten()
75+
for i, block in enumerate(blocks):
76+
chunk = block.compute()
77+
assert chunk.dtype == np.float64, (
78+
f"chunk {i} computed as {chunk.dtype}, expected float64; "
79+
f"per-chunk dtype divergence is the #1597 regression."
80+
)
81+
82+
83+
def test_dask_keeps_dtype_for_out_of_range_sentinel(tmp_path):
84+
"""Out-of-range sentinels (uint16 + nodata=-9999) stay uint16.
85+
86+
The fix should not regress #1581: when the sentinel cannot match
87+
any pixel, no float64 promotion is needed and the dask path keeps
88+
the file's native dtype.
89+
"""
90+
arr = np.array([[1, 2, 3, 4]] * 4, dtype=np.uint16)
91+
path = str(tmp_path / 'uint16_out_of_range_1597.tif')
92+
write(arr, path, nodata=-9999, compression='none', tiled=False)
93+
94+
dk = open_geotiff(path, chunks=2)
95+
assert dk.dtype == np.uint16
96+
result = dk.compute()
97+
assert result.dtype == np.uint16
98+
np.testing.assert_array_equal(result.values, arr)
99+
100+
101+
def test_dask_float_input_with_sentinel_in_one_chunk(tmp_path):
102+
"""Float rasters with sentinel in non-first chunk also stay float.
103+
104+
The float path doesn't promote dtype, but it does in-place NaN
105+
substitution. Verify the substitution holds for chunks with and
106+
without the sentinel.
107+
"""
108+
arr = np.arange(64, dtype=np.float32).reshape(8, 8) + 1
109+
arr[6:8, 6:8] = -9999.0
110+
path = str(tmp_path / 'float_corner_sentinel_1597.tif')
111+
write(arr, path, nodata=-9999, compression='none', tiled=False)
112+
113+
eager = open_geotiff(path)
114+
dk = open_geotiff(path, chunks=4).compute()
115+
np.testing.assert_array_equal(np.isnan(dk.values),
116+
np.isnan(eager.values))

0 commit comments

Comments
 (0)