Skip to content

Commit 8d17a84

Browse files
authored
Skip no-op astype in _delayed_read_window (#1624) (#1626)
* Skip no-op astype in _delayed_read_window (#1624) After #1601 widened the call site to always pass target_dtype, every dask chunk ran arr.astype(target_dtype) even when arr.dtype already matched. numpy.astype defaults to copy=True and so allocated a same- dtype chunk-sized buffer and memcpy on every read. Gate the cast on a real dtype mismatch; the #1597 mask path still promotes uint to float64 inline so every chunk lands in the dask-declared dtype. Regression test in test_dask_no_op_astype_1624.py wraps read_to_array to capture an ndarray subclass that records astype calls, then asserts no same-dtype call survives the per-chunk return path. * Address review: drop stub test, exercise uint16 fixture array Copilot review on #1626 flagged two unused-variable issues. Fixes: - Delete test_float32_chunks_avoid_redundant_copy. It declared same_dtype_casts and wrapped _delayed_read_window without recording anything, so it only asserted output dtype -- which does not validate the #1624 regression. test_astype_skipped_when_dtypes_match already covers the same surface properly via an ndarray subclass that tracks astype calls; verified it fails on the pre-fix code with "Same-dtype astype still runs per chunk". - In test_uint16_mask_path_still_promotes, exercise the previously- unused arr fixture: assert that pixels equal to the 65535 sentinel become NaN and every other pixel matches arr.astype(float64). Anchors the test to fixture values so regressions in the mask branch surface here, not just as dtype drift.
1 parent 9a5f55e commit 8d17a84

2 files changed

Lines changed: 137 additions & 1 deletion

File tree

xrspatial/geotiff/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1868,7 +1868,14 @@ def _read(http_meta):
18681868
if mask.any():
18691869
arr = arr.astype(np.float64)
18701870
arr[mask] = np.nan
1871-
if target_dtype is not None:
1871+
if target_dtype is not None and arr.dtype != target_dtype:
1872+
# Skip the cast when dtype already matches. ``numpy.astype``
1873+
# defaults to ``copy=True`` and would otherwise allocate a
1874+
# full chunk-sized buffer and memcpy on every read just to
1875+
# land in the same dtype the array already has. The int->
1876+
# float64 promotion above (sentinel-hit branch) keeps the
1877+
# contract that every chunk lands in the dask-declared
1878+
# dtype; this guard only elides no-op casts. See #1624.
18721879
arr = arr.astype(target_dtype)
18731880
return arr
18741881
return _read(http_meta_key)
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Regression tests for issue #1624.
2+
3+
After #1597/#1601 widened ``_delayed_read_window`` to always pass
4+
``target_dtype`` through to per-chunk reads, every chunk ran
5+
``arr.astype(target_dtype)`` even when ``arr.dtype == target_dtype``
6+
already. ``numpy.ndarray.astype`` defaults to ``copy=True`` and so
7+
allocated a same-dtype chunk-sized buffer and memcpy on every chunk of
8+
every read, doubling peak per-chunk memory on plain float reads.
9+
10+
The fix gates the astype on a real dtype mismatch. The #1597 mask path
11+
still promotes uint -> float64 inline so every chunk lands in the
12+
dask-declared dtype.
13+
"""
14+
from __future__ import annotations
15+
16+
import numpy as np
17+
import pytest
18+
19+
from xrspatial.geotiff import open_geotiff, read_geotiff_dask
20+
from xrspatial.geotiff._writer import write
21+
22+
23+
@pytest.fixture
24+
def float32_no_nodata_tif(tmp_path):
25+
"""Write a 16x16 float32 TIFF with no nodata sentinel."""
26+
rng = np.random.RandomState(1624)
27+
arr = rng.rand(16, 16).astype(np.float32)
28+
path = str(tmp_path / 'float32_no_nodata_1624.tif')
29+
write(arr, path, compression='none', tiled=False)
30+
return path, arr
31+
32+
33+
@pytest.fixture
34+
def uint16_with_sentinel_in_first_chunk(tmp_path):
35+
"""uint16 raster with sentinel in chunk 0 so the mask hits there."""
36+
arr = np.arange(64, dtype=np.uint16).reshape(8, 8) + 1
37+
arr[0, 0] = 65535
38+
arr[6, 6] = 65535
39+
path = str(tmp_path / 'uint16_sentinel_1624.tif')
40+
write(arr, path, nodata=65535, compression='none', tiled=False)
41+
return path, arr
42+
43+
44+
def test_uint16_mask_path_still_promotes(uint16_with_sentinel_in_first_chunk):
45+
"""The #1597 promotion still runs when sentinels are present."""
46+
path, arr = uint16_with_sentinel_in_first_chunk
47+
eager = open_geotiff(path)
48+
dk = open_geotiff(path, chunks=4)
49+
assert dk.dtype == np.float64
50+
computed = dk.compute()
51+
assert computed.dtype == np.float64
52+
np.testing.assert_array_equal(np.isnan(computed.values),
53+
np.isnan(eager.values))
54+
# Pixels that held the sentinel in the source array are NaN; every
55+
# other pixel matches the source value byte-for-byte after the
56+
# uint -> float64 promotion. Anchors the test to fixture values so
57+
# any regression in the mask path (e.g. wrong sentinel comparison)
58+
# surfaces here, not just as dtype drift.
59+
sentinel_mask = arr == 65535
60+
np.testing.assert_array_equal(np.isnan(computed.values), sentinel_mask)
61+
np.testing.assert_array_equal(
62+
computed.values[~sentinel_mask],
63+
arr[~sentinel_mask].astype(np.float64),
64+
)
65+
66+
67+
def test_astype_skipped_when_dtypes_match(float32_no_nodata_tif, monkeypatch):
68+
"""Direct trace: no astype runs on the per-chunk return path when
69+
``target_dtype`` already matches.
70+
71+
Wraps ``read_to_array`` so the array it returns is a subclass that
72+
flips a flag whenever ``astype`` is called. With the bug, every
73+
chunk triggers one same-dtype astype. With the fix, none do.
74+
"""
75+
from xrspatial.geotiff import _reader as reader_mod
76+
import xrspatial.geotiff as gt
77+
78+
path, _ = float32_no_nodata_tif
79+
80+
class _AstypeTrackingArray(np.ndarray):
81+
"""ndarray subclass that records astype calls."""
82+
83+
def __new__(cls, input_array):
84+
obj = np.asarray(input_array).view(cls)
85+
obj._astype_calls = []
86+
return obj
87+
88+
def __array_finalize__(self, obj):
89+
if obj is None:
90+
return
91+
self._astype_calls = getattr(obj, '_astype_calls', [])
92+
93+
def astype(self, dtype, *args, **kwargs):
94+
self._astype_calls.append(np.dtype(dtype))
95+
return super().astype(dtype, *args, **kwargs)
96+
97+
captured: list = []
98+
99+
orig_r2a = reader_mod.read_to_array
100+
101+
def wrapped_r2a(*args, **kwargs):
102+
arr, meta = orig_r2a(*args, **kwargs)
103+
tracked = _AstypeTrackingArray(arr)
104+
captured.append(tracked)
105+
return tracked, meta
106+
107+
monkeypatch.setattr(gt, 'read_to_array', wrapped_r2a)
108+
109+
dk = read_geotiff_dask(path, chunks=4)
110+
dk.compute()
111+
112+
assert captured, "read_to_array was not invoked"
113+
for tracked in captured:
114+
same_dtype_calls = [c for c in tracked._astype_calls
115+
if c == tracked.dtype]
116+
assert not same_dtype_calls, (
117+
f"Same-dtype astype still runs per chunk "
118+
f"(dtype={tracked.dtype}, calls={tracked._astype_calls}); "
119+
f"this is the #1624 regression."
120+
)
121+
122+
123+
def test_caller_supplied_dtype_still_casts(float32_no_nodata_tif):
124+
"""Explicit ``dtype=float64`` still triggers the cast."""
125+
path, _ = float32_no_nodata_tif
126+
dk = read_geotiff_dask(path, dtype=np.float64, chunks=4)
127+
assert dk.dtype == np.float64
128+
out = dk.compute()
129+
assert out.dtype == np.float64

0 commit comments

Comments
 (0)