Skip to content

Commit 58a8b3d

Browse files
authored
Add Hypothesis property/fuzz tests for geotiff + fix 3 typed-error gaps (#1661) (#1666)
Adds xrspatial/geotiff/tests/test_fuzz_hypothesis_1661.py with three property groups: round-trip equality through to_geotiff/open_geotiff, IFD layout permutations via make_minimal_tiff, and single-byte mutation of a small corpus. Hypothesis is gated with pytest.importorskip so the suite still collects without the dependency installed.

The byte-mutation property surfaced three real bugs on first run. In each case a corrupt header byte produced a bare IndexError / ZeroDivisionError instead of the documented ValueError / TypeError contract.

Fixed:
- _read_strips: ZeroDivisionError when RowsPerStrip == 0.
- _read_strips: IndexError when StripByteCounts is shorter than the expected strip count (the check is placed after the existing dimension safety check, so a crafted huge header still raises the safety-limit error first).
- IFD.sample_format: IndexError when SampleFormat has count=0; it now falls back to the default sample_format = 1 (unsigned int).

Three targeted regression tests lock in the fixes alongside the property tests that surfaced them.
1 parent 5f6095c commit 58a8b3d

3 files changed

Lines changed: 347 additions & 0 deletions

File tree

xrspatial/geotiff/_header.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,13 @@ def samples_per_pixel(self) -> int:
165165
def sample_format(self) -> int:
    """Return the SampleFormat code for the first sample.

    Looks up ``TAG_SAMPLE_FORMAT`` through ``get_value`` with a default
    of 1. A non-tuple value is returned as-is; a tuple yields its first
    element.

    A SampleFormat tag with count=0 has been seen in malformed TIFFs
    (single-byte corruption flips the count field) and shows up here as
    an empty tuple. In that case the default 1 is returned instead of
    raising IndexError, so the caller can either succeed with a sensible
    dtype or fail with a typed ValueError downstream.
    """
    raw = self.get_value(TAG_SAMPLE_FORMAT, 1)
    if not isinstance(raw, tuple):
        return raw
    # Empty tuple -> default; otherwise the first entry.
    return raw[0] if raw else 1
170177

xrspatial/geotiff/_reader.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,12 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
923923
if offsets is None or byte_counts is None:
924924
raise ValueError("Missing strip offsets or byte counts")
925925

926+
# A corrupt header can report RowsPerStrip=0, which would divide by zero
927+
# below. Reject it as a typed parse error rather than letting the
928+
# ZeroDivisionError leak out to the caller.
929+
if rps is None or rps <= 0:
930+
raise ValueError(f"Invalid RowsPerStrip: {rps!r}")
931+
926932
planar = ifd.planar_config # 1=chunky (interleaved), 2=planar (separate)
927933

928934
# Determine output region
@@ -940,6 +946,17 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
940946

941947
_check_dimensions(out_w, out_h, samples, max_pixels)
942948

949+
# StripByteCounts must have at least one entry per strip; a corrupt count
950+
# field can shrink it. Detect the mismatch after the dimension safety
951+
# check so an oversized header raises the safety-limit error first, then
952+
# raise a typed ValueError here instead of IndexError when the loop
953+
# indexes past the end.
954+
n_strips_expected = (height + rps - 1) // rps
955+
if len(offsets) < n_strips_expected or len(byte_counts) < n_strips_expected:
956+
raise ValueError(
957+
f"Strip table truncated: expected {n_strips_expected} entries, "
958+
f"got offsets={len(offsets)}, byte_counts={len(byte_counts)}")
959+
943960
# Sparse strips (StripByteCounts == 0) must materialise as nodata or 0
944961
# rather than be decoded. Pre-fill the result so any skipped strips
945962
# land on a known fill value.
Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
"""Hypothesis property and fuzz tests for the geotiff module (#1661).
2+
3+
Three property groups:
4+
5+
1. Round-trip: random valid (dtype, compression, tiled, predictor, nodata) ->
6+
write with ``to_geotiff`` -> read with ``open_geotiff`` -> assert array
7+
equality and attrs preservation.
8+
9+
2. IFD layout permutations via ``make_minimal_tiff``: assert ``open_geotiff``
10+
returns a valid array, or raises ``ValueError`` / ``TypeError`` from the
11+
geotiff module. Never bare ``IndexError`` / ``struct.error`` /
12+
``UnicodeDecodeError``.
13+
14+
3. Single-byte mutation: flip one byte in a valid TIFF at a Hypothesis-chosen
15+
offset. Reader must either parse consistently or raise a typed exception.
16+
17+
The whole file is skipped if ``hypothesis`` is not installed -- it is not a
18+
hard test dep yet (see issue #1661 unresolved questions). Each test bounds
19+
example count and disables Hypothesis's deadline so CI variance doesn't
20+
flake.
21+
"""
22+
from __future__ import annotations
23+
24+
import io
25+
import struct
26+
27+
import numpy as np
28+
import pytest
29+
import xarray as xr
30+
31+
hypothesis = pytest.importorskip("hypothesis")
32+
from hypothesis import HealthCheck, example, given, settings # noqa: E402
33+
from hypothesis import strategies as st # noqa: E402
34+
35+
from xrspatial.geotiff import open_geotiff, to_geotiff # noqa: E402
36+
37+
from .conftest import make_minimal_tiff # noqa: E402
38+
39+
40+
# The only exception classes the geotiff module is allowed to raise when it
# is handed invalid input. Anything else points at an undocumented failure
# mode: either a strategy produced something that should be rejected
# explicitly, or there is a real bug.
ALLOWED_PARSE_EXCEPTIONS = (ValueError, TypeError)

# Codecs that round-trip safely for every dtype in ROUND_TRIP_DTYPES.
# 'jpeg' is explicitly rejected on write (see the _VALID_COMPRESSIONS
# docstring); 'lerc' and 'jpeg2000' are lossy or dtype-restricted and would
# need their own, narrower strategies, so they are left out.
LOSSLESS_CODECS = [
    'none',
    'deflate',
    'lzw',
    'packbits',
    'zstd',
    'lz4',
]

# A deliberately small dtype set to keep CI fast: float and int, signed and
# unsigned.
ROUND_TRIP_DTYPES = [
    'uint8',
    'uint16',
    'int16',
    'int32',
    'float32',
    'float64',
]
54+
55+
56+
# --- Strategies ---
57+
58+
@st.composite
def round_trip_inputs(draw):
    """Draw one ``(DataArray, compression, tiled, predictor)`` round-trip case.

    The array is 2D with dims ``('y', 'x')``, between 1x1 and 32x32, with a
    dtype from ``ROUND_TRIP_DTYPES``. Its contents come from a NumPy
    generator seeded with a Hypothesis-drawn integer, so a failing example
    is reproducible from the drawn values alone.
    """
    n_cols = draw(st.integers(min_value=1, max_value=32))
    n_rows = draw(st.integers(min_value=1, max_value=32))
    np_dtype = np.dtype(draw(st.sampled_from(ROUND_TRIP_DTYPES)))
    compression = draw(st.sampled_from(LOSSLESS_CODECS))
    tiled = draw(st.booleans())

    is_float = np_dtype.kind == 'f'
    # Predictor 3 is for floats only; predictor 2 is horizontal differencing,
    # good for ints. False is the other choice in both cases (presumably
    # "no predictor" -- confirm against to_geotiff).
    predictor = draw(st.sampled_from([False, 3] if is_float else [False, 2]))

    seed = draw(st.integers(min_value=0, max_value=1_000_000))
    rng = np.random.default_rng(seed)
    shape = (n_rows, n_cols)

    if is_float:
        values = rng.standard_normal(shape).astype(np_dtype)
    else:
        limits = np.iinfo(np_dtype)
        # Avoid the extreme edge of the type range; some codecs reserve
        # sentinels.
        lower = limits.min // 2 if limits.min < 0 else 0
        values = rng.integers(
            low=lower,
            high=limits.max // 2,
            size=shape,
            dtype=np_dtype,
        )

    return xr.DataArray(values, dims=('y', 'x')), compression, tiled, predictor
90+
91+
92+
@st.composite
def ifd_layout_inputs(draw):
    """Draw a spec dict describing one ``make_minimal_tiff`` layout.

    Keys: ``width`` and ``height`` (1..16), ``dtype`` (a ``np.dtype``),
    ``compression`` (always 1), ``tiled``, ``tile_size`` (4, 8 or 16),
    ``big_endian`` and ``with_geo``. The values are evaluated top to bottom,
    so the draw order is width, height, dtype, tiled, tile_size, big_endian,
    with_geo.
    """
    return {
        'width': draw(st.integers(min_value=1, max_value=16)),
        'height': draw(st.integers(min_value=1, max_value=16)),
        'dtype': np.dtype(
            draw(st.sampled_from(['uint8', 'uint16', 'int16', 'float32']))
        ),
        # Uncompressed: make_minimal_tiff only supports type 1.
        'compression': 1,
        'tiled': draw(st.booleans()),
        'tile_size': draw(st.sampled_from([4, 8, 16])),
        'big_endian': draw(st.booleans()),
        'with_geo': draw(st.booleans()),
    }
114+
115+
116+
# --- Group 1: round-trip property ---
117+
118+
@given(inputs=round_trip_inputs())
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
)
def test_round_trip_property(tmp_path_factory, inputs):
    """Writing with to_geotiff and reading with open_geotiff keeps the values.

    Each example is written to ``rt.tif`` in a fresh temp directory, read
    back with the source dtype requested, and compared element-wise against
    the source array.
    """
    source, compression, tiled, predictor = inputs
    out_path = str(tmp_path_factory.mktemp("fuzz_1661_rt") / "rt.tif")

    to_geotiff(
        source,
        out_path,
        compression=compression,
        tiled=tiled,
        predictor=predictor,
    )
    loaded = open_geotiff(out_path, dtype=str(source.dtype)).values

    # The reader may add a leading band axis; drop it for the 2D comparison.
    has_unit_band_axis = loaded.ndim == 3 and loaded.shape[0] == 1
    actual = loaded[0] if has_unit_band_axis else loaded

    np.testing.assert_array_equal(actual, source.values)
147+
148+
149+
# --- Group 2: IFD layout permutations ---
150+
151+
@given(spec=ifd_layout_inputs())
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow],
)
def test_ifd_layout_typed_errors_only(spec):
    """Every make_minimal_tiff variation parses or is refused with a typed error.

    The reader may reject any particular combination with a ValueError or
    TypeError. What it must not do is leak a bare IndexError, struct.error,
    UnicodeDecodeError, or anything else that suggests it walked off the end
    of the byte buffer without checking. When the file does parse, the last
    two dimensions of the result must equal the requested height and width.
    """
    if spec['with_geo']:
        geo_kwargs = {
            'geo_transform': (-120.0, 45.0, 0.001, -0.001),
            'epsg': 4326,
        }
    else:
        geo_kwargs = {'geo_transform': None, 'epsg': None}

    layout_keys = (
        'width', 'height', 'dtype', 'compression',
        'tiled', 'tile_size', 'big_endian',
    )
    layout_kwargs = {key: spec[key] for key in layout_keys}
    tiff_bytes = make_minimal_tiff(**layout_kwargs, **geo_kwargs)

    try:
        parsed = open_geotiff(io.BytesIO(tiff_bytes))
    except ALLOWED_PARSE_EXCEPTIONS:
        return  # Typed refusal -- acceptable.
    except Exception as exc:
        pytest.fail(
            f"open_geotiff raised non-typed {type(exc).__name__} on a "
            f"valid-by-construction TIFF: {spec!r} -> {exc!r}"
        )
    else:
        # The reader may add a leading band axis (samples=1), so only the
        # last two dims are compared with the requested size.
        assert parsed.shape[-2:] == (spec['height'], spec['width']), (
            f"shape mismatch: got {parsed.shape}, expected last dims "
            f"({spec['height']}, {spec['width']}) for {spec!r}"
        )
199+
200+
201+
# --- Group 3: byte-level mutation fuzz ---
202+
203+
# A small fixed corpus of valid TIFFs. Hypothesis only chooses a byte offset
# and a replacement byte to splice in, which keeps the strategy fast (no
# nested TIFF generation per example) and concentrates the search on how the
# parser responds to bit-rot.
_CORPUS_SPECS = [
    # (kwargs to make_minimal_tiff, label)
    (
        {'width': 4, 'height': 4, 'dtype': np.dtype('float32')},
        'le_strip_f32',
    ),
    (
        {'width': 4, 'height': 4, 'dtype': np.dtype('uint16'),
         'big_endian': True},
        'be_strip_u16',
    ),
    (
        {'width': 8, 'height': 8, 'dtype': np.dtype('float32'),
         'tiled': True, 'tile_size': 4},
        'le_tiled_f32',
    ),
    (
        {'width': 4, 'height': 4, 'dtype': np.dtype('float32'),
         'geo_transform': (-120.0, 45.0, 0.001, -0.001), 'epsg': 4326},
        'le_geo_f32',
    ),
]
# (label, tiff bytes) pairs, built once at import time.
_CORPUS = [
    (label, make_minimal_tiff(**kwargs)) for kwargs, label in _CORPUS_SPECS
]
218+
219+
220+
@pytest.mark.parametrize("label,base_tiff", _CORPUS, ids=[lab for lab, _ in _CORPUS])
# Regression seeds for the bugs the first Hypothesis run found on the
# le_strip_f32 corpus member (4x4 float32, 198 bytes total):
#   offset 102, byte 0x00 -> ZeroDivisionError in _read_strips (rps=0)
#   offset 110, byte 0x00 -> IndexError in _read_strips (StripByteCounts trunc)
#   offset 122, byte 0x00 -> IndexError in sample_format (empty tuple)
# The offsets are specific to the le_strip_f32 layout. The other corpus
# entries run the same examples at whatever offset the fraction maps to,
# which is fine -- the examples only guarantee the regressions are covered
# on every run.
@example(offset_frac=102 / 198, new_byte=0x00)
@example(offset_frac=110 / 198, new_byte=0x00)
@example(offset_frac=122 / 198, new_byte=0x00)
@given(
    offset_frac=st.floats(min_value=0.0, max_value=0.999),
    new_byte=st.integers(min_value=0, max_value=255),
)
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
)
def test_single_byte_mutation_typed_errors(label, base_tiff, offset_frac, new_byte):
    """Replace one byte of a valid TIFF; the reader must parse or fail typed.

    The mutated file may still parse (for instance when the byte lands in
    pixel data and is a valid value for that dtype). What is unacceptable is
    a bare ``IndexError`` / ``struct.error`` from reading past the buffer, or
    a segfault from the GPU/dask paths -- the latter are kept off this test
    by using the eager numpy path only.
    """
    mutated = bytearray(base_tiff)
    # offset_frac is capped at 0.999, so the scaled offset stays in range.
    offset = int(offset_frac * len(mutated))
    # Guarantee a real change: if the drawn byte equals the current one,
    # use the next byte value (wrapping at 0xFF) instead.
    new_byte = new_byte if mutated[offset] != new_byte else (new_byte + 1) & 0xFF
    mutated[offset] = new_byte

    try:
        parsed = open_geotiff(io.BytesIO(bytes(mutated)))
    except ALLOWED_PARSE_EXCEPTIONS:
        return
    except (MemoryError, OverflowError):
        # A header field can decode to an absurd dimension/offset. These
        # count as acceptable refusals because the user gets a clear failure
        # rather than wrong data.
        return
    except Exception as exc:
        pytest.fail(
            f"[{label}] single-byte mutation at offset {offset} -> {new_byte:#x} "
            f"raised non-typed {type(exc).__name__}: {exc!r}"
        )
    else:
        # A successful parse must yield a real DataArray; materialise the
        # values so that any lazy error surfaces here.
        assert isinstance(parsed, xr.DataArray)
        _ = np.asarray(parsed.values)
276+
277+
278+
# --- Smoke test that the module wired itself up ---
279+
280+
def test_corpus_baseline_parses():
    """Sanity check: each unmutated corpus TIFF opens as a non-empty DataArray.

    The corpus label is used as the assertion message so a failure names
    the offending entry.
    """
    for label, pristine in _CORPUS:
        parsed = open_geotiff(io.BytesIO(pristine))
        assert isinstance(parsed, xr.DataArray), label
        assert parsed.size > 0, label
286+
287+
288+
# --- Targeted regressions for bugs found by the property tests above ---
289+
# These three were caught by the byte-mutation property on first run and
290+
# fixed alongside this PR. They live here (not in a separate file) so the
291+
# regression context stays next to the harness that found them.
292+
293+
def test_regression_rows_per_strip_zero_is_typed_error():
    """rps=0 must raise the RowsPerStrip ValueError, not ZeroDivisionError.

    The message is matched as well as the type, so the test keeps guarding
    the RowsPerStrip check in ``_read_strips`` and cannot pass on some
    unrelated ValueError raised for the same mutated file.
    """
    base = make_minimal_tiff(4, 4, np.dtype('float32'))
    mut = bytearray(base)
    mut[102] = 0  # Zeroes the RowsPerStrip value in this layout.
    with pytest.raises(ValueError, match=r"Invalid RowsPerStrip"):
        open_geotiff(io.BytesIO(bytes(mut)))
300+
301+
302+
def test_regression_strip_table_truncated_is_typed_error():
    """A StripByteCounts table shorter than the strip count raises ValueError.

    The message is matched as well as the type, so the test keeps guarding
    the strip-table length check in ``_read_strips`` and cannot pass on some
    unrelated ValueError raised for the same mutated file.
    """
    base = make_minimal_tiff(4, 4, np.dtype('float32'))
    mut = bytearray(base)
    mut[110] = 0  # Truncates the strip table count in this layout.
    with pytest.raises(ValueError, match=r"Strip table truncated"):
        open_geotiff(io.BytesIO(bytes(mut)))
309+
310+
311+
def test_regression_empty_sample_format_tuple_does_not_indexerror():
    """A SampleFormat tag with count=0 must fall back, not raise IndexError.

    Two outcomes are acceptable: the file parses with the default
    sample_format (1 = unsigned int) and yields a DataArray, or it is
    refused downstream with a typed ValueError. An IndexError (or any other
    exception) propagates and fails the test.
    """
    pristine = make_minimal_tiff(4, 4, np.dtype('float32'))
    corrupted = bytearray(pristine)
    corrupted[122] = 0  # Zeroes the SampleFormat count field in this layout.

    try:
        parsed = open_geotiff(io.BytesIO(bytes(corrupted)))
    except ValueError:
        return  # Typed refusal downstream -- acceptable.
    assert isinstance(parsed, xr.DataArray)

0 commit comments

Comments
 (0)