|
| 1 | +"""Regression tests for issue #1657: dangerous tag leakage through extra_tags. |
| 2 | +
|
| 3 | +Before the fix, reading an overview level (NewSubfileType=1) or any TIFF |
| 4 | +with a SubIFDs entry leaked those tags into ``attrs['extra_tags']`` because |
| 5 | +they were not in ``_MANAGED_TAGS``. Writing the DataArray back via |
| 6 | +``to_geotiff`` or ``write_geotiff_gpu`` then re-emitted them on the output |
| 7 | +IFD, producing: |
| 8 | +
|
| 9 | +* A primary IFD wrongly marked as a reduced-resolution overview |
| 10 | + (``NewSubfileType=1``), so GDAL / rasterio skip it when picking the |
| 11 | + primary image. |
| 12 | +* Stale absolute byte offsets in ``SubIFDs`` that point into the new file's |
| 13 | + pixel data, crashing readers that follow the chain. |
| 14 | +
|
| 15 | +The fix adds both tags to ``_MANAGED_TAGS`` (read-side filter) and to |
| 16 | +``_DANGEROUS_EXTRA_TAG_IDS`` in ``_writer.py`` (write-side belt-and-braces |
| 17 | +guard so caller-supplied ``attrs['extra_tags']`` still produces a clean |
| 18 | +file). |
| 19 | +
|
| 20 | +These tests pin the contract on every available backend. |
| 21 | +""" |
| 22 | +from __future__ import annotations |
| 23 | + |
| 24 | +import importlib.util |
| 25 | + |
| 26 | +import numpy as np |
| 27 | +import pytest |
| 28 | + |
| 29 | +from xrspatial.geotiff import open_geotiff, to_geotiff |
| 30 | + |
| 31 | +tifffile = pytest.importorskip("tifffile") |
| 32 | + |
| 33 | + |
| 34 | +def _gpu_available() -> bool: |
| 35 | + if importlib.util.find_spec("cupy") is None: |
| 36 | + return False |
| 37 | + try: |
| 38 | + import cupy |
| 39 | + return bool(cupy.cuda.is_available()) |
| 40 | + except Exception: |
| 41 | + return False |
| 42 | + |
| 43 | + |
| 44 | +_HAS_GPU = _gpu_available() |
| 45 | +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") |
| 46 | + |
| 47 | + |
| 48 | +def _make_cog(path) -> None: |
| 49 | + """Write a small COG with overviews so each backend can read an overview. |
| 50 | +
|
| 51 | + Uses tile_size=64 + explicit overview_levels so the small test fixture |
| 52 | + actually produces a multi-IFD pyramid (auto-overview halves until the |
| 53 | + smallest level fits in one tile, so 256x256 with the default 256-tile |
| 54 | + size yields zero overviews). |
| 55 | + """ |
| 56 | + import xarray as xr |
| 57 | + da = xr.DataArray( |
| 58 | + np.arange(512 * 512, dtype=np.float32).reshape(512, 512), |
| 59 | + dims=['y', 'x'], |
| 60 | + coords={'y': np.arange(512) * -0.5 + 10.0, |
| 61 | + 'x': np.arange(512) * 0.5 - 10.0}, |
| 62 | + attrs={'crs': 4326}, |
| 63 | + ) |
| 64 | + to_geotiff(da, str(path), cog=True, |
| 65 | + tile_size=64, overview_levels=[2, 4]) |
| 66 | + |
| 67 | + |
| 68 | +# --------------------------------------------------------------------------- |
| 69 | +# Read side: overview reads must not leak NewSubfileType into attrs |
| 70 | +# --------------------------------------------------------------------------- |
| 71 | + |
| 72 | +def test_overview_read_does_not_leak_newsubfiletype_numpy(tmp_path): |
| 73 | + """Reading an overview level must not surface NewSubfileType in attrs.""" |
| 74 | + cog_path = tmp_path / 'cog.tif' |
| 75 | + _make_cog(cog_path) |
| 76 | + ov = open_geotiff(str(cog_path), overview_level=1) |
| 77 | + extra = ov.attrs.get('extra_tags') |
| 78 | + assert extra is None or not any(t[0] == 254 for t in extra), ( |
| 79 | + f"NewSubfileType (tag 254) leaked into attrs['extra_tags']: {extra}" |
| 80 | + ) |
| 81 | + |
| 82 | + |
| 83 | +def test_overview_read_does_not_leak_newsubfiletype_dask(tmp_path): |
| 84 | + """Same contract for the dask backend.""" |
| 85 | + cog_path = tmp_path / 'cog.tif' |
| 86 | + _make_cog(cog_path) |
| 87 | + ov = open_geotiff(str(cog_path), overview_level=1, chunks=32) |
| 88 | + extra = ov.attrs.get('extra_tags') |
| 89 | + assert extra is None or not any(t[0] == 254 for t in extra), ( |
| 90 | + f"NewSubfileType (tag 254) leaked under dask: {extra}" |
| 91 | + ) |
| 92 | + |
| 93 | + |
| 94 | +@_gpu_only |
| 95 | +def test_overview_read_does_not_leak_newsubfiletype_cupy(tmp_path): |
| 96 | + """Same contract for the cupy backend.""" |
| 97 | + cog_path = tmp_path / 'cog.tif' |
| 98 | + _make_cog(cog_path) |
| 99 | + ov = open_geotiff(str(cog_path), overview_level=1, gpu=True) |
| 100 | + extra = ov.attrs.get('extra_tags') |
| 101 | + assert extra is None or not any(t[0] == 254 for t in extra), ( |
| 102 | + f"NewSubfileType (tag 254) leaked under cupy: {extra}" |
| 103 | + ) |
| 104 | + |
| 105 | + |
| 106 | +@_gpu_only |
| 107 | +def test_overview_read_does_not_leak_newsubfiletype_dask_cupy(tmp_path): |
| 108 | + """Same contract for the dask+cupy backend.""" |
| 109 | + cog_path = tmp_path / 'cog.tif' |
| 110 | + _make_cog(cog_path) |
| 111 | + ov = open_geotiff( |
| 112 | + str(cog_path), overview_level=1, gpu=True, chunks=32) |
| 113 | + extra = ov.attrs.get('extra_tags') |
| 114 | + assert extra is None or not any(t[0] == 254 for t in extra), ( |
| 115 | + f"NewSubfileType (tag 254) leaked under dask+cupy: {extra}" |
| 116 | + ) |
| 117 | + |
| 118 | + |
| 119 | +# --------------------------------------------------------------------------- |
| 120 | +# Read side: SubIFDs must not leak from level-0 IFDs that carry one |
| 121 | +# --------------------------------------------------------------------------- |
| 122 | + |
| 123 | +def test_subifds_does_not_leak_into_attrs(tmp_path): |
| 124 | + """tifffile writes SubIFDs by default on multi-page TIFFs. |
| 125 | +
|
| 126 | + Anything carrying tag 330 must not surface in ``attrs['extra_tags']`` |
| 127 | + because the byte offsets are file-absolute and cannot be replayed |
| 128 | + into a rewritten file. |
| 129 | + """ |
| 130 | + data = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) |
| 131 | + path = tmp_path / 'subifd.tif' |
| 132 | + with tifffile.TiffWriter(str(path)) as tw: |
| 133 | + tw.write(data, tile=(32, 32), subifds=1) |
| 134 | + tw.write(data[::2, ::2], tile=(32, 32), subfiletype=1) |
| 135 | + |
| 136 | + da = open_geotiff(str(path)) |
| 137 | + extra = da.attrs.get('extra_tags') |
| 138 | + assert extra is None or not any(t[0] == 330 for t in extra), ( |
| 139 | + f"SubIFDs (tag 330) leaked into attrs['extra_tags']: {extra}" |
| 140 | + ) |
| 141 | + |
| 142 | + |
| 143 | +# --------------------------------------------------------------------------- |
| 144 | +# Round-trip: overview read -> write must produce a valid primary IFD |
| 145 | +# --------------------------------------------------------------------------- |
| 146 | + |
| 147 | +def _read_subfile_type(path) -> int | None: |
| 148 | + """Return the NewSubfileType value on page 0 of a TIFF, or None if absent.""" |
| 149 | + with tifffile.TiffFile(str(path)) as tf: |
| 150 | + page = tf.pages[0] |
| 151 | + tag = page.tags.get('NewSubfileType') |
| 152 | + return None if tag is None else int(tag.value) |
| 153 | + |
| 154 | + |
| 155 | +def test_overview_roundtrip_primary_ifd_clean_numpy(tmp_path): |
| 156 | + """Round-tripping an overview must not mark the output as an overview.""" |
| 157 | + cog_path = tmp_path / 'cog.tif' |
| 158 | + _make_cog(cog_path) |
| 159 | + ov = open_geotiff(str(cog_path), overview_level=1) |
| 160 | + out = tmp_path / 'out_numpy.tif' |
| 161 | + to_geotiff(ov, str(out)) |
| 162 | + sft = _read_subfile_type(out) |
| 163 | + assert sft in (None, 0), ( |
| 164 | + f"Round-tripped overview produced NewSubfileType={sft} on the " |
| 165 | + f"primary IFD (expected None or 0)." |
| 166 | + ) |
| 167 | + |
| 168 | + |
| 169 | +def test_overview_roundtrip_primary_ifd_clean_dask(tmp_path): |
| 170 | + """Same contract for the dask read path.""" |
| 171 | + cog_path = tmp_path / 'cog.tif' |
| 172 | + _make_cog(cog_path) |
| 173 | + ov = open_geotiff(str(cog_path), overview_level=1, chunks=32) |
| 174 | + out = tmp_path / 'out_dask.tif' |
| 175 | + to_geotiff(ov, str(out)) |
| 176 | + sft = _read_subfile_type(out) |
| 177 | + assert sft in (None, 0), ( |
| 178 | + f"Dask round-tripped overview produced NewSubfileType={sft}." |
| 179 | + ) |
| 180 | + |
| 181 | + |
| 182 | +@_gpu_only |
| 183 | +def test_overview_roundtrip_primary_ifd_clean_cupy(tmp_path): |
| 184 | + """Same contract for the cupy read + write path.""" |
| 185 | + cog_path = tmp_path / 'cog.tif' |
| 186 | + _make_cog(cog_path) |
| 187 | + ov = open_geotiff(str(cog_path), overview_level=1, gpu=True) |
| 188 | + out = tmp_path / 'out_cupy.tif' |
| 189 | + to_geotiff(ov, str(out)) # auto-dispatches to GPU writer |
| 190 | + sft = _read_subfile_type(out) |
| 191 | + assert sft in (None, 0), ( |
| 192 | + f"Cupy round-tripped overview produced NewSubfileType={sft}." |
| 193 | + ) |
| 194 | + |
| 195 | + |
| 196 | +# --------------------------------------------------------------------------- |
| 197 | +# Writer belt-and-braces: caller-supplied dangerous extra_tags get filtered |
| 198 | +# --------------------------------------------------------------------------- |
| 199 | + |
| 200 | +def test_writer_filters_caller_supplied_newsubfiletype(tmp_path): |
| 201 | + """A caller passing extra_tags with NewSubfileType=1 still gets a |
| 202 | + clean primary IFD on output.""" |
| 203 | + import xarray as xr |
| 204 | + da = xr.DataArray( |
| 205 | + np.zeros((32, 32), dtype=np.float32), |
| 206 | + dims=['y', 'x'], |
| 207 | + coords={'y': np.arange(32) * -0.5 + 10.0, |
| 208 | + 'x': np.arange(32) * 0.5 - 10.0}, |
| 209 | + attrs={ |
| 210 | + 'crs': 4326, |
| 211 | + # Tag 254 with LONG (type id 4) count 1 value 1 -- format |
| 212 | + # produced by the read path before the fix. |
| 213 | + 'extra_tags': [(254, 4, 1, 1)], |
| 214 | + }, |
| 215 | + ) |
| 216 | + out = tmp_path / 'with_dangerous_extra_tag.tif' |
| 217 | + to_geotiff(da, str(out)) |
| 218 | + sft = _read_subfile_type(out) |
| 219 | + assert sft in (None, 0), ( |
| 220 | + f"Writer accepted dangerous extra_tags[254]={sft}, expected None/0." |
| 221 | + ) |
| 222 | + |
| 223 | + |
| 224 | +def test_writer_filters_caller_supplied_subifds(tmp_path): |
| 225 | + """A caller passing extra_tags with SubIFDs offsets must not emit |
| 226 | + those stale offsets onto the output file.""" |
| 227 | + import xarray as xr |
| 228 | + da = xr.DataArray( |
| 229 | + np.zeros((32, 32), dtype=np.float32), |
| 230 | + dims=['y', 'x'], |
| 231 | + coords={'y': np.arange(32) * -0.5 + 10.0, |
| 232 | + 'x': np.arange(32) * 0.5 - 10.0}, |
| 233 | + attrs={ |
| 234 | + 'crs': 4326, |
| 235 | + # Garbage offsets that would crash readers if emitted. |
| 236 | + 'extra_tags': [(330, 4, 2, (999999, 888888))], |
| 237 | + }, |
| 238 | + ) |
| 239 | + out = tmp_path / 'with_subifds.tif' |
| 240 | + to_geotiff(da, str(out)) |
| 241 | + with tifffile.TiffFile(str(out)) as tf: |
| 242 | + sub = tf.pages[0].tags.get('SubIFDs') |
| 243 | + assert sub is None, ( |
| 244 | + f"Writer emitted SubIFDs={sub.value}, should have filtered it." |
| 245 | + ) |
| 246 | + |
| 247 | + |
| 248 | +def test_writer_keeps_benign_extra_tags(tmp_path): |
| 249 | + """The filter must only drop the dangerous IDs (254, 330) and let |
| 250 | + other extra_tags entries through. Use Software (305) -- ASCII, no |
| 251 | + embedded offsets -- as a benign canary.""" |
| 252 | + import xarray as xr |
| 253 | + da = xr.DataArray( |
| 254 | + np.zeros((32, 32), dtype=np.float32), |
| 255 | + dims=['y', 'x'], |
| 256 | + coords={'y': np.arange(32) * -0.5 + 10.0, |
| 257 | + 'x': np.arange(32) * 0.5 - 10.0}, |
| 258 | + attrs={ |
| 259 | + 'crs': 4326, |
| 260 | + 'extra_tags': [ |
| 261 | + (254, 4, 1, 1), # dangerous, must be filtered |
| 262 | + (305, 2, 12, 'tifffile.py'), # benign, must survive |
| 263 | + ], |
| 264 | + }, |
| 265 | + ) |
| 266 | + out = tmp_path / 'mixed_extra_tags.tif' |
| 267 | + to_geotiff(da, str(out)) |
| 268 | + with tifffile.TiffFile(str(out)) as tf: |
| 269 | + page = tf.pages[0] |
| 270 | + assert page.tags.get('NewSubfileType') is None |
| 271 | + software = page.tags.get('Software') |
| 272 | + assert software is not None, ( |
| 273 | + "Benign extra_tag (305 Software) was filtered too -- " |
| 274 | + "filter is too aggressive." |
| 275 | + ) |
| 276 | + assert 'tifffile' in str(software.value) |
0 commit comments