|
| 1 | +"""Hypothesis property and fuzz tests for the geotiff module (#1661). |
| 2 | +
|
| 3 | +Three property groups: |
| 4 | +
|
| 5 | +1. Round-trip: random valid (dtype, compression, tiled, predictor) -> |
| 6 | + write with ``to_geotiff`` -> read with ``open_geotiff`` -> assert array |
| 7 | + equality. (nodata and attrs preservation are not exercised yet.) |
| 8 | +
|
| 9 | +2. IFD layout permutations via ``make_minimal_tiff``: assert ``open_geotiff`` |
| 10 | + returns a valid array, or raises ``ValueError`` / ``TypeError`` from the |
| 11 | + geotiff module. Never bare ``IndexError`` / ``struct.error`` / |
| 12 | + ``UnicodeDecodeError``. |
| 13 | +
|
| 14 | +3. Single-byte mutation: flip one byte in a valid TIFF at a Hypothesis-chosen |
| 15 | + offset. Reader must either parse consistently or raise a typed exception. |
| 16 | +
|
| 17 | +The whole file is skipped if ``hypothesis`` is not installed -- it is not a |
| 18 | +hard test dep yet (see issue #1661 unresolved questions). Each test bounds |
| 19 | +example count and disables Hypothesis's deadline so CI variance doesn't |
| 20 | +flake. |
| 21 | +""" |
| 22 | +from __future__ import annotations |
| 23 | + |
| 24 | +import io |
| 25 | +import struct |
| 26 | + |
| 27 | +import numpy as np |
| 28 | +import pytest |
| 29 | +import xarray as xr |
| 30 | + |
| 31 | +hypothesis = pytest.importorskip("hypothesis") |
| 32 | +from hypothesis import HealthCheck, example, given, settings # noqa: E402 |
| 33 | +from hypothesis import strategies as st # noqa: E402 |
| 34 | + |
| 35 | +from xrspatial.geotiff import open_geotiff, to_geotiff # noqa: E402 |
| 36 | + |
| 37 | +from .conftest import make_minimal_tiff # noqa: E402 |
| 38 | + |
| 39 | + |
# The only exception classes the geotiff module may raise on invalid input.
# Any other class points at an undocumented failure mode: either a strategy
# generated something that should be rejected explicitly, or there is a real
# bug in the reader.
ALLOWED_PARSE_EXCEPTIONS = (
    ValueError,
    TypeError,
)

# Codecs that are safe to round-trip on every dtype the strategy draws.
# 'jpeg' is explicitly rejected on write (see _VALID_COMPRESSIONS docstring);
# 'lerc' and 'jpeg2000' are lossy or dtype-restricted and would need their
# own narrower strategies, so they are left out.
LOSSLESS_CODECS = [
    'none',
    'deflate',
    'lzw',
    'packbits',
    'zstd',
    'lz4',
]

# Deliberately small so CI stays fast: float and int, signed and unsigned.
ROUND_TRIP_DTYPES = [
    'uint8',
    'uint16',
    'int16',
    'int32',
    'float32',
    'float64',
]
| 54 | + |
| 55 | + |
| 56 | +# --- Strategies --- |
| 57 | + |
@st.composite
def round_trip_inputs(draw):
    """Build one round-trip case: ``(DataArray, compression, tiled, predictor)``.

    The array is 2D with dims ``('y', 'x')`` and at most 32x32 cells. Its
    contents come from a NumPy generator seeded with a Hypothesis-drawn
    integer, so each example is reproducible from the drawn values alone.
    """
    width = draw(st.integers(min_value=1, max_value=32))
    height = draw(st.integers(min_value=1, max_value=32))
    np_dtype = np.dtype(draw(st.sampled_from(ROUND_TRIP_DTYPES)))
    compression = draw(st.sampled_from(LOSSLESS_CODECS))
    tiled = draw(st.booleans())

    is_float = np_dtype.kind == 'f'
    # Predictor 3 is for floats only; predictor 2 (horizontal differencing)
    # is the one that suits ints. False is drawn as the no-predictor option.
    predictor = draw(st.sampled_from([False, 3] if is_float else [False, 2]))

    # Draw order (predictor first, then seed) is kept identical in both
    # dtype branches so shrinking and replay behave the same as before.
    seed = draw(st.integers(min_value=0, max_value=1_000_000))
    rng = np.random.default_rng(seed)
    shape = (height, width)

    if is_float:
        values = rng.standard_normal(shape).astype(np_dtype)
    else:
        limits = np.iinfo(np_dtype)
        # Stay away from the extreme edge of the type range; some codecs
        # reserve sentinels.
        lower = limits.min // 2 if limits.min < 0 else 0
        values = rng.integers(
            low=lower,
            high=limits.max // 2,
            size=shape,
            dtype=np_dtype,
        )

    return xr.DataArray(values, dims=('y', 'x')), compression, tiled, predictor
| 90 | + |
| 91 | + |
@st.composite
def ifd_layout_inputs(draw):
    """Draw the settings for one valid (or borderline) make_minimal_tiff call.

    Returns a dict with the keys ``width``, ``height``, ``dtype``,
    ``compression``, ``tiled``, ``tile_size``, ``big_endian`` and
    ``with_geo``. Values are drawn in the same order the keys are listed.
    """
    return {
        'width': draw(st.integers(min_value=1, max_value=16)),
        'height': draw(st.integers(min_value=1, max_value=16)),
        'dtype': np.dtype(
            draw(st.sampled_from(['uint8', 'uint16', 'int16', 'float32']))
        ),
        # Uncompressed: make_minimal_tiff only supports type 1.
        'compression': 1,
        'tiled': draw(st.booleans()),
        'tile_size': draw(st.sampled_from([4, 8, 16])),
        'big_endian': draw(st.booleans()),
        'with_geo': draw(st.booleans()),
    }
| 114 | + |
| 115 | + |
| 116 | +# --- Group 1: round-trip property --- |
| 117 | + |
@given(inputs=round_trip_inputs())
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
)
def test_round_trip_property(tmp_path_factory, inputs):
    """Write with to_geotiff, read with open_geotiff, expect equal values."""
    source, codec, use_tiles, predictor_mode = inputs

    # A fresh directory per example keeps one example's file from leaking
    # into the next.
    out_path = str(tmp_path_factory.mktemp("fuzz_1661_rt") / "rt.tif")

    to_geotiff(
        source,
        out_path,
        compression=codec,
        tiled=use_tiles,
        predictor=predictor_mode,
    )
    loaded = open_geotiff(out_path, dtype=str(source.dtype)).values

    # The reader may prepend a length-1 band axis; drop it so the comparison
    # is 2D against 2D.
    has_band_axis = loaded.ndim == 3 and loaded.shape[0] == 1
    if has_band_axis:
        loaded = loaded[0]

    np.testing.assert_array_equal(loaded, source.values)
| 147 | + |
| 148 | + |
| 149 | +# --- Group 2: IFD layout permutations --- |
| 150 | + |
@given(spec=ifd_layout_inputs())
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow],
)
def test_ifd_layout_typed_errors_only(spec):
    """Every make_minimal_tiff variation parses or is refused with a typed error.

    A ValueError/TypeError refusal of a particular combination is fine. A
    bare IndexError, struct.error, UnicodeDecodeError, or any other class is
    not: it suggests the reader walked off the end of the byte buffer
    without checking.
    """
    if spec['with_geo']:
        geo_transform, epsg = (-120.0, 45.0, 0.001, -0.001), 4326
    else:
        geo_transform, epsg = None, None

    # These spec entries are forwarded unchanged as keyword arguments.
    layout_keys = (
        'width', 'height', 'dtype', 'compression',
        'tiled', 'tile_size', 'big_endian',
    )
    tiff_bytes = make_minimal_tiff(
        **{key: spec[key] for key in layout_keys},
        geo_transform=geo_transform,
        epsg=epsg,
    )

    try:
        parsed = open_geotiff(io.BytesIO(tiff_bytes))
    except ALLOWED_PARSE_EXCEPTIONS:
        return  # Typed refusal -- acceptable.
    except Exception as err:
        pytest.fail(
            f"open_geotiff raised non-typed {type(err).__name__} on a "
            f"valid-by-construction TIFF: {spec!r} -> {err!r}"
        )

    # A successful parse must report the requested size. The reader may add
    # a leading band axis (samples=1), so only the last two dims are checked.
    expected_tail = (spec['height'], spec['width'])
    assert parsed.shape[-2:] == expected_tail, (
        f"shape mismatch: got {parsed.shape}, expected last dims "
        f"({spec['height']}, {spec['width']}) for {spec!r}"
    )
| 199 | + |
| 200 | + |
| 201 | +# --- Group 3: byte-level mutation fuzz --- |
| 202 | + |
# A small fixed corpus of TIFFs. Hypothesis only chooses a byte offset and a
# replacement byte value, which keeps the strategy fast (no TIFF generation
# nested inside each example) and focuses the search on how the parser
# reacts to bit-rot.
_CORPUS_SPECS = [
    # (kwargs to make_minimal_tiff, label)
    (
        {'width': 4, 'height': 4, 'dtype': np.dtype('float32')},
        'le_strip_f32',
    ),
    (
        {'width': 4, 'height': 4, 'dtype': np.dtype('uint16'), 'big_endian': True},
        'be_strip_u16',
    ),
    (
        {'width': 8, 'height': 8, 'dtype': np.dtype('float32'),
         'tiled': True, 'tile_size': 4},
        'le_tiled_f32',
    ),
    (
        {'width': 4, 'height': 4, 'dtype': np.dtype('float32'),
         'geo_transform': (-120.0, 45.0, 0.001, -0.001), 'epsg': 4326},
        'le_geo_f32',
    ),
]
# Built once at import time as (label, tiff_bytes) pairs.
_CORPUS = [(name, make_minimal_tiff(**kwargs)) for kwargs, name in _CORPUS_SPECS]
| 218 | + |
| 219 | + |
@pytest.mark.parametrize("label,base_tiff", _CORPUS, ids=[entry[0] for entry in _CORPUS])
# Pinned regression seeds. The first Hypothesis run against the le_strip_f32
# corpus member (4x4 float32, 198 bytes total) surfaced:
#   offset 102, byte 0x00 -> ZeroDivisionError in _read_strips (rps=0)
#   offset 110, byte 0x00 -> IndexError in _read_strips (StripByteCounts trunc)
#   offset 122, byte 0x00 -> IndexError in sample_format (empty tuple)
# The offsets only line up with the le_strip_f32 layout. On the other corpus
# entries the same fractions land on different bytes, which is fine: the
# examples exist to guarantee the regressions are replayed on every run.
@example(offset_frac=102 / 198, new_byte=0x00)
@example(offset_frac=110 / 198, new_byte=0x00)
@example(offset_frac=122 / 198, new_byte=0x00)
@given(
    offset_frac=st.floats(min_value=0.0, max_value=0.999),
    new_byte=st.integers(min_value=0, max_value=255),
)
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
)
def test_single_byte_mutation_typed_errors(label, base_tiff, offset_frac, new_byte):
    """Overwrite one byte of a valid TIFF; the reader parses or refuses cleanly.

    A mutated file may still parse, for example when the byte lands in pixel
    data and is a valid value for that dtype. What must not happen is a bare
    ``IndexError`` / ``struct.error`` from reading past the buffer, or a
    segfault from the GPU/dask paths -- the latter are kept out of this test
    by using the eager numpy path only.
    """
    buf = bytearray(base_tiff)
    # offset_frac is capped at 0.999, so the scaled index stays below len(buf).
    position = int(len(buf) * offset_frac)
    # Guarantee a real change: if the drawn byte equals the current one,
    # use the next value (wrapping at 256) instead.
    replacement = new_byte if buf[position] != new_byte else (new_byte + 1) % 256
    buf[position] = replacement

    try:
        parsed = open_geotiff(io.BytesIO(bytes(buf)))
    except (*ALLOWED_PARSE_EXCEPTIONS, MemoryError, OverflowError):
        # Typed refusals are acceptable. MemoryError/OverflowError are too:
        # a header field can decode to an absurd dimension/offset, and the
        # user still gets a clear failure rather than wrong data.
        return
    except Exception as err:
        pytest.fail(
            f"[{label}] single-byte mutation at offset {position} -> {replacement:#x} "
            f"raised non-typed {type(err).__name__}: {err!r}"
        )

    # A successful parse must hand back a real DataArray; touching .values
    # materialises it so any deferred read error surfaces here.
    assert isinstance(parsed, xr.DataArray)
    _ = np.asarray(parsed.values)
| 276 | + |
| 277 | + |
| 278 | +# --- Smoke test that the module wired itself up --- |
| 279 | + |
def test_corpus_baseline_parses():
    """Every unmutated corpus TIFF must open as a non-empty DataArray."""
    for name, raw in _CORPUS:
        stream = io.BytesIO(raw)
        parsed = open_geotiff(stream)
        # The corpus label is the assertion message, so a failure names
        # the offending entry.
        assert isinstance(parsed, xr.DataArray), name
        assert parsed.size > 0, name
| 286 | + |
| 287 | + |
| 288 | +# --- Targeted regressions for bugs found by the property tests above --- |
| 289 | +# These three were caught by the byte-mutation property on first run and |
| 290 | +# fixed alongside this PR. They live here (not in a separate file) so the |
| 291 | +# regression context stays next to the harness that found them. |
| 292 | + |
def test_regression_rows_per_strip_zero_is_typed_error():
    """A zeroed RowsPerStrip must surface as ValueError, not ZeroDivisionError."""
    corrupted = bytearray(make_minimal_tiff(4, 4, np.dtype('float32')))
    corrupted[102] = 0  # Zeroes the RowsPerStrip value in this layout.
    stream = io.BytesIO(bytes(corrupted))
    with pytest.raises(ValueError):
        open_geotiff(stream)
| 300 | + |
| 301 | + |
def test_regression_strip_table_truncated_is_typed_error():
    """A StripByteCounts table shorter than the strip count must raise ValueError."""
    corrupted = bytearray(make_minimal_tiff(4, 4, np.dtype('float32')))
    corrupted[110] = 0  # Truncates the strip table count in this layout.
    stream = io.BytesIO(bytes(corrupted))
    with pytest.raises(ValueError):
        open_geotiff(stream)
| 309 | + |
| 310 | + |
def test_regression_empty_sample_format_tuple_does_not_indexerror():
    """A SampleFormat tag with count=0 must fall back or refuse, never IndexError."""
    corrupted = bytearray(make_minimal_tiff(4, 4, np.dtype('float32')))
    corrupted[122] = 0  # Zeroes the SampleFormat count field in this layout.

    # Two outcomes are acceptable: the reader falls back to the default
    # sample_format (1 = unsigned int) and returns a DataArray, or it fails
    # downstream with a typed ValueError. An IndexError is deliberately not
    # caught, so it fails the test.
    try:
        parsed = open_geotiff(io.BytesIO(bytes(corrupted)))
    except ValueError:
        return
    assert isinstance(parsed, xr.DataArray)
0 commit comments