Skip to content

Commit 51408f3

Browse files
authored
geotiff: surface VRT skipped sources as attrs['vrt_holes'] (#1734) (#1747)
1 parent f2db37a commit 51408f3

3 files changed

Lines changed: 217 additions & 1 deletion

File tree

xrspatial/geotiff/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3508,6 +3508,16 @@ def read_vrt(source: str, *, dtype=None,
35083508
attrs['crs_wkt'] = vrt.crs_wkt
35093509
if vrt.raster_type == 'point':
35103510
attrs['raster_type'] = 'point'
3511+
# Surface skipped-source records as ``attrs['vrt_holes']`` so
3512+
# callers can detect a partial mosaic by attribute lookup. Under
3513+
# lenient mode (the default), the underlying ``_vrt.read_vrt``
3514+
# already warned per skipped source but the warning is easy to
3515+
# miss in a pipeline; the attr lets downstream code branch on
3516+
# ``"vrt_holes" in da.attrs`` instead of monitoring the warnings
3517+
# stream. Empty list is omitted so the attr only appears when
3518+
# there is actually a hole. See issue #1734.
3519+
if vrt.holes:
3520+
attrs['vrt_holes'] = list(vrt.holes)
35113521
# When a specific band is selected, source its nodata from that
35123522
# band's <NoDataValue> instead of band 0's. Otherwise multi-band
35133523
# VRTs with per-band sentinels would mis-mask the read: attrs would

xrspatial/geotiff/_vrt.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,14 @@ class VRTDataset:
181181
# coords must be emitted without the shift. Parsed from
182182
# ``<Metadata><MDI key="AREA_OR_POINT">Point</MDI></Metadata>``.
183183
raster_type: str = 'area' # 'area' or 'point'
184+
# Per-load record of sources skipped by ``read_vrt`` under the
185+
# lenient default (not strict mode). Each entry is a dict with
186+
# ``source``, ``band`` (1-based), ``dst_rect`` (xoff, yoff,
187+
# xsize, ysize), and ``error`` keys. Empty when no sources failed
188+
# to read. Populated by :func:`read_vrt` so callers can detect
189+
# holes by attribute lookup instead of parsing a UserWarning
190+
# message (issue #1734).
191+
holes: list[dict] = field(default_factory=list)
184192

185193

186194
def _parse_rect(elem) -> _Rect:
@@ -656,17 +664,37 @@ def read_vrt(vrt_path: str, *, window=None,
656664
# so partial mosaics are caught in CI. Default mode warns
657665
# once per missing source then continues, preserving the
658666
# historical behaviour. See issue #1662.
667+
#
668+
# The lenient default leaves zero-filled holes in
669+
# integer VRTs that downstream code cannot tell from
670+
# real data unless the band has a nodata sentinel set.
671+
# Recording the skipped source on ``vrt.holes`` lets the
672+
# public ``read_vrt`` surface a machine-readable
673+
# ``attrs['vrt_holes']`` entry on the returned
674+
# DataArray, alongside the warning. Callers that need
675+
# to fail loudly can either inspect that attr or set
676+
# ``XRSPATIAL_GEOTIFF_STRICT=1`` to raise here.
677+
# See issue #1734.
659678
import warnings
660679
from . import _geotiff_strict_mode, GeoTIFFFallbackWarning
661680
if _geotiff_strict_mode():
662681
raise
663682
warnings.warn(
664683
f"VRT source {src.filename!r} could not be read "
665684
f"({type(e).__name__}: {e}); skipping. The output "
666-
f"mosaic will have a hole at this tile.",
685+
f"mosaic will have a hole at this tile. Inspect "
686+
f"``DataArray.attrs['vrt_holes']`` or set "
687+
f"XRSPATIAL_GEOTIFF_STRICT=1 to raise instead.",
667688
GeoTIFFFallbackWarning,
668689
stacklevel=2,
669690
)
691+
vrt.holes.append({
692+
'source': src.filename,
693+
'band': src.band,
694+
'dst_rect': (dr.x_off, dr.y_off,
695+
dr.x_size, dr.y_size),
696+
'error': f"{type(e).__name__}: {e}",
697+
})
670698
continue # skip missing/unreadable sources
671699

672700
# Handle source nodata. Cast the sentinel to the *source*
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
"""Regression test for issue #1734.
2+
3+
Under the lenient default (``XRSPATIAL_GEOTIFF_STRICT`` unset),
4+
``read_vrt`` warns once per unreadable source and continues, producing
5+
a mosaic with zero-filled holes for integer VRTs that downstream code
6+
cannot distinguish from real data. The warning is easy to miss in a
7+
pipeline that ignores ``UserWarning``s.
8+
9+
This module pins the new behaviour: the returned DataArray now carries
10+
an ``attrs['vrt_holes']`` list describing each skipped source so
11+
callers can detect a partial mosaic with a single attribute lookup.
12+
Strict mode is unchanged and still raises.
13+
"""
14+
from __future__ import annotations
15+
16+
import warnings
17+
18+
import pytest
19+
20+
from xrspatial.geotiff import GeoTIFFFallbackWarning, read_vrt
21+
22+
23+
@pytest.fixture
24+
def clear_strict_env(monkeypatch):
25+
monkeypatch.delenv('XRSPATIAL_GEOTIFF_STRICT', raising=False)
26+
27+
28+
@pytest.fixture
29+
def set_strict_env(monkeypatch):
30+
monkeypatch.setenv('XRSPATIAL_GEOTIFF_STRICT', '1')
31+
32+
33+
def _write_vrt_with_missing_source(vrt_path, missing_src) -> None:
34+
"""Write a VRT with an Int32 band whose only source is missing.
35+
36+
Integer ``dataType`` is the failure mode issue #1734 was about: the
37+
pre-fix lenient path zero-fills the output buffer (``fill = 0`` for
38+
integer dtypes) and the user cannot distinguish that hole from real
39+
zero-valued data. ``NoDataValue`` is omitted on purpose -- having
40+
one would let downstream code mask the hole and side-step the
41+
regression. See the module docstring.
42+
"""
43+
vrt_path.write_text(
44+
'<VRTDataset rasterXSize="4" rasterYSize="4">\n'
45+
' <SRS></SRS>\n'
46+
' <GeoTransform>0, 1, 0, 0, 0, -1</GeoTransform>\n'
47+
' <VRTRasterBand dataType="Int32" band="1">\n'
48+
' <SimpleSource>\n'
49+
f' <SourceFilename relativeToVRT="0">{missing_src}'
50+
'</SourceFilename>\n'
51+
' <SourceBand>1</SourceBand>\n'
52+
' <SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
53+
' <DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
54+
' </SimpleSource>\n'
55+
' </VRTRasterBand>\n'
56+
'</VRTDataset>\n'
57+
)
58+
59+
60+
def test_skipped_source_records_vrt_holes_attr(
61+
clear_strict_env, tmp_path,
62+
):
63+
"""A VRT with a missing source returns a DataArray whose attrs
64+
carry a ``vrt_holes`` entry naming the source, band, dst_rect,
65+
and underlying error.
66+
67+
Uses an Int32 VRT so the hole is zero-filled (the exact failure
68+
mode #1734 was about): without the attr there is no way to tell
69+
the all-zeros tile from real data.
70+
"""
71+
import numpy as np
72+
73+
vrt_path = tmp_path / 'mosaic_1734_missing.vrt'
74+
missing_src = f'{tmp_path}/does_not_exist_1734.tif'
75+
_write_vrt_with_missing_source(vrt_path, missing_src)
76+
77+
with warnings.catch_warnings():
78+
warnings.simplefilter('ignore', GeoTIFFFallbackWarning)
79+
da = read_vrt(str(vrt_path))
80+
81+
# Confirm the integer-specific failure mode is in play: the hole is
82+
# filled with zeros (not NaN), indistinguishable from real data
83+
# without the attr.
84+
assert np.issubdtype(da.dtype, np.integer)
85+
assert (da.values == 0).all()
86+
87+
assert 'vrt_holes' in da.attrs
88+
holes = da.attrs['vrt_holes']
89+
assert isinstance(holes, list)
90+
assert len(holes) == 1
91+
h = holes[0]
92+
assert h['source'].endswith('does_not_exist_1734.tif')
93+
assert h['band'] == 1
94+
assert h['dst_rect'] == (0, 0, 4, 4)
95+
assert 'error' in h
96+
assert h['error'] # non-empty
97+
98+
99+
def test_no_holes_attr_when_all_sources_read(clear_strict_env, tmp_path):
100+
"""A successful VRT read does not advertise an empty ``vrt_holes``
101+
attr; the key is omitted entirely so ``"vrt_holes" in attrs`` is a
102+
cheap completeness check."""
103+
import numpy as np
104+
import xarray as xr
105+
from xrspatial.geotiff import to_geotiff
106+
107+
# Write a real source the VRT can reference.
108+
src_path = tmp_path / 'src_1734.tif'
109+
arr = np.arange(16, dtype=np.float32).reshape(4, 4)
110+
da_src = xr.DataArray(
111+
arr, dims=['y', 'x'],
112+
coords={'y': np.linspace(3.5, 0.5, 4),
113+
'x': np.linspace(0.5, 3.5, 4)},
114+
attrs={'crs': 4326},
115+
)
116+
to_geotiff(da_src, str(src_path), compression='none')
117+
118+
vrt_path = tmp_path / 'mosaic_1734_ok.vrt'
119+
vrt_path.write_text(
120+
'<VRTDataset rasterXSize="4" rasterYSize="4">\n'
121+
' <SRS></SRS>\n'
122+
' <GeoTransform>0, 1, 0, 0, 0, -1</GeoTransform>\n'
123+
' <VRTRasterBand dataType="Float32" band="1">\n'
124+
' <SimpleSource>\n'
125+
f' <SourceFilename relativeToVRT="0">{src_path}'
126+
'</SourceFilename>\n'
127+
' <SourceBand>1</SourceBand>\n'
128+
' <SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
129+
' <DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
130+
' </SimpleSource>\n'
131+
' </VRTRasterBand>\n'
132+
'</VRTDataset>\n'
133+
)
134+
135+
with warnings.catch_warnings():
136+
warnings.simplefilter('error', GeoTIFFFallbackWarning)
137+
da = read_vrt(str(vrt_path))
138+
139+
assert 'vrt_holes' not in da.attrs
140+
141+
142+
def test_strict_mode_still_raises(set_strict_env, tmp_path):
143+
"""Strict mode is unchanged: the missing source surfaces the
144+
underlying ``FileNotFoundError`` (an ``OSError`` subclass) from
145+
``read_to_array`` instead of warning-and-skipping.
146+
147+
Asserting the concrete exception class -- not a bare ``Exception``
148+
-- keeps the regression test honest: an unrelated bug somewhere in
149+
the read path that happens to raise a different exception will
150+
fail this test instead of silently satisfying it.
151+
"""
152+
vrt_path = tmp_path / 'mosaic_1734_strict.vrt'
153+
missing_src = f'{tmp_path}/does_not_exist_1734_strict.tif'
154+
_write_vrt_with_missing_source(vrt_path, missing_src)
155+
156+
with pytest.raises(FileNotFoundError,
157+
match='does_not_exist_1734_strict.tif'):
158+
read_vrt(str(vrt_path))
159+
160+
161+
def test_warning_mentions_how_to_detect_holes(clear_strict_env, tmp_path):
162+
"""The fallback warning now points callers at the attr or the
163+
strict env var so the recovery path is discoverable from a single
164+
captured warning."""
165+
vrt_path = tmp_path / 'mosaic_1734_msg.vrt'
166+
missing_src = f'{tmp_path}/does_not_exist_1734_msg.tif'
167+
_write_vrt_with_missing_source(vrt_path, missing_src)
168+
169+
with warnings.catch_warnings(record=True) as w:
170+
warnings.simplefilter('always')
171+
read_vrt(str(vrt_path))
172+
173+
fallback = [
174+
x for x in w if issubclass(x.category, GeoTIFFFallbackWarning)
175+
]
176+
assert fallback, "expected at least one GeoTIFFFallbackWarning"
177+
msg = ' '.join(str(x.message) for x in fallback)
178+
assert 'vrt_holes' in msg or 'XRSPATIAL_GEOTIFF_STRICT' in msg

0 commit comments

Comments
 (0)