Skip to content

Commit 7cc65b2

Browse files
committed
Preserve GDALMetadata XML (tag 42112) through read/write
Band names, statistics, and other GDAL-specific metadata stored in the GDALMetadata XML tag (42112) are now read, exposed, and written back. Read: the XML is parsed into a dict stored in attrs['gdal_metadata']. Dataset-level items use string keys ('DataType'), per-band items use (name, band_int) tuple keys (('STATISTICS_MAXIMUM', 0)). The raw XML is also available in attrs['gdal_metadata_xml']. Write: accepts gdal_metadata_xml on write(), or extracts from DataArray attrs on write_geotiff(). If attrs has a gdal_metadata dict but no raw XML, it's re-serialized automatically. Round-trip verified on the USGS 1-meter DEM which has statistics, layer type, and data type metadata -- all items survive intact. 7 new tests: XML parsing, dict serialization, file round-trip, DataArray attrs preservation, no-metadata baseline, real-file read, and real-file round-trip.
1 parent cfaf93d commit 7cc65b2

File tree

5 files changed

+209
-1
lines changed

5 files changed

+209
-1
lines changed

xrspatial/geotiff/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,12 @@ def read_geotiff(source: str, *, window=None,
176176
if geo_info.vertical_units is not None:
177177
attrs['vertical_units'] = geo_info.vertical_units
178178

179+
# GDAL metadata (tag 42112)
180+
if geo_info.gdal_metadata is not None:
181+
attrs['gdal_metadata'] = geo_info.gdal_metadata
182+
if geo_info.gdal_metadata_xml is not None:
183+
attrs['gdal_metadata_xml'] = geo_info.gdal_metadata_xml
184+
179185
# Resolution / DPI metadata
180186
if geo_info.x_resolution is not None:
181187
attrs['x_resolution'] = geo_info.x_resolution
@@ -275,6 +281,7 @@ def write_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
275281
x_res = None
276282
y_res = None
277283
res_unit = None
284+
gdal_meta_xml = None
278285

279286
# Resolve crs argument: can be int (EPSG) or str (WKT/PROJ)
280287
if isinstance(crs, int):
@@ -297,6 +304,13 @@ def write_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
297304
nodata = data.attrs.get('nodata')
298305
if data.attrs.get('raster_type') == 'point':
299306
raster_type = RASTER_PIXEL_IS_POINT
307+
# GDAL metadata from attrs (prefer raw XML, fall back to dict)
308+
gdal_meta_xml = data.attrs.get('gdal_metadata_xml')
309+
if gdal_meta_xml is None:
310+
gdal_meta_dict = data.attrs.get('gdal_metadata')
311+
if isinstance(gdal_meta_dict, dict):
312+
from ._geotags import _build_gdal_metadata_xml
313+
gdal_meta_xml = _build_gdal_metadata_xml(gdal_meta_dict)
300314
# Resolution / DPI from attrs
301315
x_res = data.attrs.get('x_resolution')
302316
y_res = data.attrs.get('y_resolution')
@@ -326,6 +340,7 @@ def write_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
326340
x_resolution=x_res,
327341
y_resolution=y_res,
328342
resolution_unit=res_unit,
343+
gdal_metadata_xml=gdal_meta_xml,
329344
)
330345

331346

xrspatial/geotiff/_geotags.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,55 @@ class GeoInfo:
109109
vertical_units_code: int | None = None
110110
# WKT CRS string (resolved from EPSG via pyproj, or provided by caller)
111111
crs_wkt: str | None = None
112+
# GDAL metadata: dict of {name: value} for dataset-level items,
113+
# and {(name, band): value} for per-band items. Raw XML also kept.
114+
gdal_metadata: dict | None = None
115+
gdal_metadata_xml: str | None = None
112116
# Raw geokeys dict for anything else
113117
geokeys: dict[int, int | float | str] = field(default_factory=dict)
114118

115119

120+
def _parse_gdal_metadata(xml_str: str) -> dict:
121+
"""Parse GDALMetadata XML into a flat dict.
122+
123+
Dataset-level items are stored as ``{name: value}``.
124+
Per-band items are stored as ``{(name, band_int): value}``.
125+
"""
126+
import xml.etree.ElementTree as ET
127+
result = {}
128+
try:
129+
root = ET.fromstring(xml_str)
130+
for item in root.findall('Item'):
131+
name = item.get('name', '')
132+
sample = item.get('sample')
133+
text = item.text or ''
134+
if sample is not None:
135+
result[(name, int(sample))] = text
136+
else:
137+
result[name] = text
138+
except ET.ParseError:
139+
pass
140+
return result
141+
142+
143+
def _build_gdal_metadata_xml(meta: dict) -> str:
144+
"""Serialize a metadata dict back to GDALMetadata XML.
145+
146+
Accepts the same dict format that _parse_gdal_metadata produces:
147+
string keys for dataset-level, (name, band) tuples for per-band.
148+
"""
149+
lines = ['<GDALMetadata>']
150+
for key, value in meta.items():
151+
if isinstance(key, tuple):
152+
name, sample = key
153+
lines.append(
154+
f' <Item name="{name}" sample="{sample}">{value}</Item>')
155+
else:
156+
lines.append(f' <Item name="{key}">{value}</Item>')
157+
lines.append('</GDALMetadata>')
158+
return '\n'.join(lines) + '\n'
159+
160+
116161
def _epsg_to_wkt(epsg: int) -> str | None:
117162
"""Resolve an EPSG code to a WKT string using pyproj.
118163
@@ -379,6 +424,12 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
379424
except (ValueError, TypeError):
380425
pass
381426

427+
# Parse GDALMetadata XML (tag 42112)
428+
gdal_metadata = None
429+
gdal_metadata_xml = ifd.gdal_metadata
430+
if gdal_metadata_xml is not None:
431+
gdal_metadata = _parse_gdal_metadata(gdal_metadata_xml)
432+
382433
# Extract palette colormap (Photometric=3, tag 320)
383434
colormap = None
384435
if ifd.photometric == 3:
@@ -430,6 +481,8 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
430481
vertical_units=vert_units_name,
431482
vertical_units_code=vert_units_code,
432483
crs_wkt=crs_wkt,
484+
gdal_metadata=gdal_metadata,
485+
gdal_metadata_xml=gdal_metadata_xml,
433486
geokeys=geokeys,
434487
)
435488

xrspatial/geotiff/_header.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
TAG_TILE_BYTE_COUNTS = 325
3636
TAG_COLORMAP = 320
3737
TAG_SAMPLE_FORMAT = 339
38+
TAG_GDAL_METADATA = 42112
3839
TAG_GDAL_NODATA = 42113
3940

4041
# GeoTIFF tags
@@ -184,6 +185,16 @@ def colormap(self) -> tuple | None:
184185
"""ColorMap tag (320) values, or None if absent."""
185186
return self.get_values(TAG_COLORMAP)
186187

188+
@property
189+
def gdal_metadata(self) -> str | None:
190+
"""GDALMetadata XML string (tag 42112), or None if absent."""
191+
v = self.get_value(TAG_GDAL_METADATA)
192+
if v is None:
193+
return None
194+
if isinstance(v, bytes):
195+
return v.rstrip(b'\x00').decode('ascii', errors='replace')
196+
return str(v).rstrip('\x00')
197+
187198
@property
188199
def nodata_str(self) -> str | None:
189200
"""GDAL_NODATA tag value as string, or None."""

xrspatial/geotiff/_writer.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
TAG_TILE_OFFSETS,
5252
TAG_TILE_BYTE_COUNTS,
5353
TAG_PREDICTOR,
54+
TAG_GDAL_METADATA,
5455
)
5556

5657
# Byte order: always write little-endian
@@ -412,6 +413,7 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,
412413
nodata,
413414
is_cog: bool = False,
414415
raster_type: int = 1,
416+
gdal_metadata_xml: str | None = None,
415417
x_resolution: float | None = None,
416418
y_resolution: float | None = None,
417419
resolution_unit: int | None = None) -> bytes:
@@ -516,6 +518,11 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,
516518
elif gtag == TAG_GDAL_NODATA:
517519
tags.append((gtag, ASCII, len(str(gval)) + 1, str(gval)))
518520

521+
# GDALMetadata XML (tag 42112)
522+
if gdal_metadata_xml is not None:
523+
tags.append((TAG_GDAL_METADATA, ASCII,
524+
len(gdal_metadata_xml) + 1, gdal_metadata_xml))
525+
519526
ifd_specs.append(tags)
520527

521528
# --- Determine if BigTIFF is needed ---
@@ -703,7 +710,8 @@ def write(data: np.ndarray, path: str, *,
703710
raster_type: int = 1,
704711
x_resolution: float | None = None,
705712
y_resolution: float | None = None,
706-
resolution_unit: int | None = None) -> None:
713+
resolution_unit: int | None = None,
714+
gdal_metadata_xml: str | None = None) -> None:
707715
"""Write a numpy array as a GeoTIFF or COG.
708716
709717
Parameters
@@ -772,6 +780,7 @@ def write(data: np.ndarray, path: str, *,
772780
w, h, data.dtype, comp_tag, predictor, tiled, tile_size,
773781
parts, geo_transform, crs_epsg, nodata, is_cog=cog,
774782
raster_type=raster_type,
783+
gdal_metadata_xml=gdal_metadata_xml,
775784
x_resolution=x_resolution, y_resolution=y_resolution,
776785
resolution_unit=resolution_unit,
777786
)

xrspatial/geotiff/tests/test_features.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,126 @@ def test_no_crs_no_wkt(self, tmp_path):
413413
# Resolution / DPI tags
414414
# -----------------------------------------------------------------------
415415

416+
# -----------------------------------------------------------------------
417+
# GDAL metadata (tag 42112)
418+
# -----------------------------------------------------------------------
419+
420+
class TestGDALMetadata:
421+
422+
def test_parse_gdal_metadata_xml(self):
423+
"""XML parsing extracts dataset and per-band items."""
424+
from xrspatial.geotiff._geotags import _parse_gdal_metadata
425+
xml = (
426+
'<GDALMetadata>\n'
427+
' <Item name="DataType">Generic</Item>\n'
428+
' <Item name="STATISTICS_MAX" sample="0">100.5</Item>\n'
429+
' <Item name="STATISTICS_MIN" sample="0">-5.2</Item>\n'
430+
' <Item name="BAND_NAME" sample="1">green</Item>\n'
431+
'</GDALMetadata>\n'
432+
)
433+
meta = _parse_gdal_metadata(xml)
434+
assert meta['DataType'] == 'Generic'
435+
assert meta[('STATISTICS_MAX', 0)] == '100.5'
436+
assert meta[('STATISTICS_MIN', 0)] == '-5.2'
437+
assert meta[('BAND_NAME', 1)] == 'green'
438+
439+
def test_build_gdal_metadata_xml(self):
440+
"""Dict serializes back to valid XML."""
441+
from xrspatial.geotiff._geotags import (
442+
_build_gdal_metadata_xml, _parse_gdal_metadata)
443+
meta = {
444+
'DataType': 'Generic',
445+
('STATS_MAX', 0): '42.0',
446+
('STATS_MIN', 0): '-1.0',
447+
}
448+
xml = _build_gdal_metadata_xml(meta)
449+
assert '<GDALMetadata>' in xml
450+
assert '<Item name="DataType">Generic</Item>' in xml
451+
assert 'sample="0"' in xml
452+
# Round-trip through parser
453+
reparsed = _parse_gdal_metadata(xml)
454+
assert reparsed == meta
455+
456+
def test_round_trip_via_file(self, tmp_path):
457+
"""GDAL metadata survives write -> read."""
458+
meta = {
459+
'DataType': 'Elevation',
460+
('STATISTICS_MAXIMUM', 0): '2500.0',
461+
('STATISTICS_MINIMUM', 0): '100.0',
462+
('STATISTICS_MEAN', 0): '1200.5',
463+
}
464+
from xrspatial.geotiff._geotags import _build_gdal_metadata_xml
465+
xml = _build_gdal_metadata_xml(meta)
466+
467+
arr = np.ones((4, 4), dtype=np.float32)
468+
path = str(tmp_path / 'gdal_meta.tif')
469+
write(arr, path, compression='none', tiled=False,
470+
gdal_metadata_xml=xml)
471+
472+
da = read_geotiff(path)
473+
assert 'gdal_metadata' in da.attrs
474+
assert 'gdal_metadata_xml' in da.attrs
475+
result_meta = da.attrs['gdal_metadata']
476+
assert result_meta['DataType'] == 'Elevation'
477+
assert result_meta[('STATISTICS_MAXIMUM', 0)] == '2500.0'
478+
assert result_meta[('STATISTICS_MEAN', 0)] == '1200.5'
479+
480+
def test_dataarray_attrs_round_trip(self, tmp_path):
481+
"""GDAL metadata from DataArray attrs is preserved."""
482+
meta = {'Source': 'test', ('BAND', 0): 'dem'}
483+
da = xr.DataArray(
484+
np.ones((4, 4), dtype=np.float32),
485+
dims=['y', 'x'],
486+
attrs={'gdal_metadata': meta},
487+
)
488+
path = str(tmp_path / 'da_meta.tif')
489+
write_geotiff(da, path, compression='none')
490+
491+
result = read_geotiff(path)
492+
assert result.attrs['gdal_metadata']['Source'] == 'test'
493+
assert result.attrs['gdal_metadata'][('BAND', 0)] == 'dem'
494+
495+
def test_no_metadata_no_attrs(self, tmp_path):
496+
"""Files without GDAL metadata don't get the attrs."""
497+
arr = np.ones((4, 4), dtype=np.float32)
498+
path = str(tmp_path / 'no_meta.tif')
499+
write(arr, path, compression='none', tiled=False)
500+
501+
da = read_geotiff(path)
502+
assert 'gdal_metadata' not in da.attrs
503+
assert 'gdal_metadata_xml' not in da.attrs
504+
505+
def test_real_file_metadata(self):
506+
"""Real USGS file has GDAL metadata with statistics."""
507+
import os
508+
path = '../rtxpy/examples/USGS_one_meter_x65y454_NY_LongIsland_Z18_2014.tif'
509+
if not os.path.exists(path):
510+
pytest.skip("Real test files not available")
511+
512+
da = read_geotiff(path)
513+
meta = da.attrs.get('gdal_metadata')
514+
assert meta is not None
515+
assert 'DataType' in meta
516+
assert ('STATISTICS_MAXIMUM', 0) in meta
517+
518+
def test_real_file_round_trip(self):
519+
"""GDAL metadata survives real-file round-trip."""
520+
import os, tempfile
521+
path = '../rtxpy/examples/USGS_one_meter_x65y454_NY_LongIsland_Z18_2014.tif'
522+
if not os.path.exists(path):
523+
pytest.skip("Real test files not available")
524+
525+
da = read_geotiff(path)
526+
orig_meta = da.attrs['gdal_metadata']
527+
528+
out = os.path.join(tempfile.mkdtemp(), 'rt.tif')
529+
write_geotiff(da, out, compression='deflate', tiled=False)
530+
531+
da2 = read_geotiff(out)
532+
for k, v in orig_meta.items():
533+
assert da2.attrs['gdal_metadata'].get(k) == v, f"Mismatch on {k}"
534+
535+
416536
class TestResolution:
417537

418538
def test_write_read_dpi(self, tmp_path):

0 commit comments

Comments
 (0)