Skip to content

Commit e6d86c8

Browse files
committed
geotiff: surface VRT skipped sources as attrs['vrt_holes'] (#1734)
The default lenient mode of read_vrt warns once per unreadable source and continues, returning a mosaic with zero-filled holes that downstream integer-VRT code cannot distinguish from real data. The warning is easy to miss in a pipeline that filters UserWarnings. Add a structured side-channel: VRTDataset gains a .holes list that the read loop populates with {source, band, dst_rect, error} on each skipped source. The public read_vrt copies a non-empty list into the returned DataArray as attrs['vrt_holes'] so callers can detect partial mosaics by a single attribute lookup. The fallback warning message now points at attrs['vrt_holes'] or the XRSPATIAL_GEOTIFF_STRICT env var, so the recovery path is discoverable from a single captured warning. Strict mode (XRSPATIAL_GEOTIFF_STRICT=1) is unchanged and still raises. Adds regression coverage in test_vrt_holes_attr_1734.py.
1 parent 77e6e4e commit e6d86c8

3 files changed

Lines changed: 188 additions & 1 deletion

File tree

xrspatial/geotiff/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3474,6 +3474,16 @@ def read_vrt(source: str, *, dtype=None,
34743474
attrs['crs_wkt'] = vrt.crs_wkt
34753475
if vrt.raster_type == 'point':
34763476
attrs['raster_type'] = 'point'
3477+
# Surface skipped-source records as ``attrs['vrt_holes']`` so
3478+
# callers can detect a partial mosaic by attribute lookup. Under
3479+
# lenient mode (the default), the underlying ``_vrt.read_vrt``
3480+
# already warned per skipped source but the warning is easy to
3481+
# miss in a pipeline; the attr lets downstream code branch on
3482+
# ``"vrt_holes" in da.attrs`` instead of monitoring the warnings
3483+
# stream. Empty list is omitted so the attr only appears when
3484+
# there is actually a hole. See issue #1734.
3485+
if vrt.holes:
3486+
attrs['vrt_holes'] = list(vrt.holes)
34773487
# When a specific band is selected, source its nodata from that
34783488
# band's <NoDataValue> instead of band 0's. Otherwise multi-band
34793489
# VRTs with per-band sentinels would mis-mask the read: attrs would

xrspatial/geotiff/_vrt.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,14 @@ class VRTDataset:
181181
# coords must be emitted without the shift. Parsed from
182182
# ``<Metadata><MDI key="AREA_OR_POINT">Point</MDI></Metadata>``.
183183
raster_type: str = 'area' # 'area' or 'point'
184+
# Per-load record of sources skipped by ``read_vrt`` under the
185+
# lenient default (not strict mode). Each entry is a dict with
186+
# ``source``, ``band`` (1-based), ``dst_rect`` (xoff, yoff,
187+
# xsize, ysize), and ``error`` keys. Empty when no sources failed
188+
# to read. Populated by :func:`read_vrt` so callers can detect
189+
# holes by attribute lookup instead of parsing a UserWarning
190+
# message (issue #1734).
191+
holes: list[dict] = field(default_factory=list)
184192

185193

186194
def _parse_rect(elem) -> _Rect:
@@ -656,17 +664,37 @@ def read_vrt(vrt_path: str, *, window=None,
656664
# so partial mosaics are caught in CI. Default mode warns
657665
# once per missing source then continues, preserving the
658666
# historical behaviour. See issue #1662.
667+
#
668+
# The lenient default leaves zero-filled holes in
669+
# integer VRTs that downstream code cannot tell from
670+
# real data unless the band has a nodata sentinel set.
671+
# Recording the skipped source on ``vrt.holes`` lets the
672+
# public ``read_vrt`` surface a machine-readable
673+
# ``attrs['vrt_holes']`` entry on the returned
674+
# DataArray, alongside the warning. Callers that need
675+
# to fail loudly can either inspect that attr or set
676+
# ``XRSPATIAL_GEOTIFF_STRICT=1`` to raise here.
677+
# See issue #1734.
659678
import warnings
660679
from . import _geotiff_strict_mode, GeoTIFFFallbackWarning
661680
if _geotiff_strict_mode():
662681
raise
663682
warnings.warn(
664683
f"VRT source {src.filename!r} could not be read "
665684
f"({type(e).__name__}: {e}); skipping. The output "
666-
f"mosaic will have a hole at this tile.",
685+
f"mosaic will have a hole at this tile. Inspect "
686+
f"``DataArray.attrs['vrt_holes']`` or set "
687+
f"XRSPATIAL_GEOTIFF_STRICT=1 to raise instead.",
667688
GeoTIFFFallbackWarning,
668689
stacklevel=2,
669690
)
691+
vrt.holes.append({
692+
'source': src.filename,
693+
'band': src.band,
694+
'dst_rect': (dr.x_off, dr.y_off,
695+
dr.x_size, dr.y_size),
696+
'error': f"{type(e).__name__}: {e}",
697+
})
670698
continue # skip missing/unreadable sources
671699

672700
# Handle source nodata. Cast the sentinel to the *source*
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
"""Regression test for issue #1734.
2+
3+
Under the lenient default (``XRSPATIAL_GEOTIFF_STRICT`` unset),
4+
``read_vrt`` warns once per unreadable source and continues, producing
5+
a mosaic with zero-filled holes for integer VRTs that downstream code
6+
cannot distinguish from real data. The warning is easy to miss in a
7+
pipeline that ignores ``UserWarning``s.
8+
9+
This module pins the new behaviour: the returned DataArray now carries
10+
an ``attrs['vrt_holes']`` list describing each skipped source so
11+
callers can detect a partial mosaic with a single attribute lookup.
12+
Strict mode is unchanged and still raises.
13+
"""
14+
from __future__ import annotations
15+
16+
import warnings
17+
18+
import pytest
19+
20+
from xrspatial.geotiff import GeoTIFFFallbackWarning, read_vrt
21+
22+
23+
@pytest.fixture
24+
def clear_strict_env(monkeypatch):
25+
monkeypatch.delenv('XRSPATIAL_GEOTIFF_STRICT', raising=False)
26+
27+
28+
@pytest.fixture
29+
def set_strict_env(monkeypatch):
30+
monkeypatch.setenv('XRSPATIAL_GEOTIFF_STRICT', '1')
31+
32+
33+
def _write_vrt_with_missing_source(vrt_path, missing_src) -> None:
34+
vrt_path.write_text(
35+
'<VRTDataset rasterXSize="4" rasterYSize="4">\n'
36+
' <SRS></SRS>\n'
37+
' <GeoTransform>0, 1, 0, 0, 0, -1</GeoTransform>\n'
38+
' <VRTRasterBand dataType="Float32" band="1">\n'
39+
' <NoDataValue>-9999</NoDataValue>\n'
40+
' <SimpleSource>\n'
41+
f' <SourceFilename relativeToVRT="0">{missing_src}'
42+
'</SourceFilename>\n'
43+
' <SourceBand>1</SourceBand>\n'
44+
' <SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
45+
' <DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
46+
' </SimpleSource>\n'
47+
' </VRTRasterBand>\n'
48+
'</VRTDataset>\n'
49+
)
50+
51+
52+
def test_skipped_source_records_vrt_holes_attr(
53+
clear_strict_env, tmp_path,
54+
):
55+
"""A VRT with a missing source returns a DataArray whose attrs
56+
carry a ``vrt_holes`` entry naming the source, band, dst_rect,
57+
and underlying error."""
58+
vrt_path = tmp_path / 'mosaic_1734_missing.vrt'
59+
missing_src = f'{tmp_path}/does_not_exist_1734.tif'
60+
_write_vrt_with_missing_source(vrt_path, missing_src)
61+
62+
with warnings.catch_warnings():
63+
warnings.simplefilter('ignore', GeoTIFFFallbackWarning)
64+
da = read_vrt(str(vrt_path))
65+
66+
assert 'vrt_holes' in da.attrs
67+
holes = da.attrs['vrt_holes']
68+
assert isinstance(holes, list)
69+
assert len(holes) == 1
70+
h = holes[0]
71+
assert h['source'].endswith('does_not_exist_1734.tif')
72+
assert h['band'] == 1
73+
assert h['dst_rect'] == (0, 0, 4, 4)
74+
assert 'error' in h
75+
assert h['error'] # non-empty
76+
77+
78+
def test_no_holes_attr_when_all_sources_read(clear_strict_env, tmp_path):
79+
"""A successful VRT read does not advertise an empty ``vrt_holes``
80+
attr; the key is omitted entirely so ``"vrt_holes" in attrs`` is a
81+
cheap completeness check."""
82+
import numpy as np
83+
import xarray as xr
84+
from xrspatial.geotiff import to_geotiff
85+
86+
# Write a real source the VRT can reference.
87+
src_path = tmp_path / 'src_1734.tif'
88+
arr = np.arange(16, dtype=np.float32).reshape(4, 4)
89+
da_src = xr.DataArray(
90+
arr, dims=['y', 'x'],
91+
coords={'y': np.linspace(3.5, 0.5, 4),
92+
'x': np.linspace(0.5, 3.5, 4)},
93+
attrs={'crs': 4326},
94+
)
95+
to_geotiff(da_src, str(src_path), compression='none')
96+
97+
vrt_path = tmp_path / 'mosaic_1734_ok.vrt'
98+
vrt_path.write_text(
99+
'<VRTDataset rasterXSize="4" rasterYSize="4">\n'
100+
' <SRS></SRS>\n'
101+
' <GeoTransform>0, 1, 0, 0, 0, -1</GeoTransform>\n'
102+
' <VRTRasterBand dataType="Float32" band="1">\n'
103+
' <SimpleSource>\n'
104+
f' <SourceFilename relativeToVRT="0">{src_path}'
105+
'</SourceFilename>\n'
106+
' <SourceBand>1</SourceBand>\n'
107+
' <SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
108+
' <DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
109+
' </SimpleSource>\n'
110+
' </VRTRasterBand>\n'
111+
'</VRTDataset>\n'
112+
)
113+
114+
with warnings.catch_warnings():
115+
warnings.simplefilter('error', GeoTIFFFallbackWarning)
116+
da = read_vrt(str(vrt_path))
117+
118+
assert 'vrt_holes' not in da.attrs
119+
120+
121+
def test_strict_mode_still_raises(set_strict_env, tmp_path):
122+
"""Strict mode is unchanged: the missing source raises and no
123+
DataArray is returned."""
124+
vrt_path = tmp_path / 'mosaic_1734_strict.vrt'
125+
missing_src = f'{tmp_path}/does_not_exist_1734_strict.tif'
126+
_write_vrt_with_missing_source(vrt_path, missing_src)
127+
128+
with pytest.raises(Exception):
129+
read_vrt(str(vrt_path))
130+
131+
132+
def test_warning_mentions_how_to_detect_holes(clear_strict_env, tmp_path):
133+
"""The fallback warning now points callers at the attr or the
134+
strict env var so the recovery path is discoverable from a single
135+
captured warning."""
136+
vrt_path = tmp_path / 'mosaic_1734_msg.vrt'
137+
missing_src = f'{tmp_path}/does_not_exist_1734_msg.tif'
138+
_write_vrt_with_missing_source(vrt_path, missing_src)
139+
140+
with warnings.catch_warnings(record=True) as w:
141+
warnings.simplefilter('always')
142+
read_vrt(str(vrt_path))
143+
144+
fallback = [
145+
x for x in w if issubclass(x.category, GeoTIFFFallbackWarning)
146+
]
147+
assert fallback, "expected at least one GeoTIFFFallbackWarning"
148+
msg = ' '.join(str(x.message) for x in fallback)
149+
assert 'vrt_holes' in msg or 'XRSPATIAL_GEOTIFF_STRICT' in msg

0 commit comments

Comments
 (0)