Skip to content

Commit cc77511

Browse files
committed
Preserve arbitrary TIFF tags through read/write round-trip
Any IFD tag that the writer doesn't explicitly manage (Software, DateTime, ImageDescription, Copyright, custom private tags, etc.) is now collected on read, stored in attrs['extra_tags'], and re-emitted on write.

Read: extract_geo_info collects a (tag_id, type_id, count, value) tuple for every tag that is not in the _MANAGED_TAGS set. That set holds the structural tags the writer builds from scratch: dimensions, compression, offsets, geo tags, etc. The collected tuples are stored in attrs['extra_tags'].

Write: extra_tags are appended to the IFD, skipping any tag_id that was already written, to avoid duplicates. The tag values are serialized using the same type-aware encoder as the built-in tags.

Tested with a hand-crafted TIFF containing Software (305) and DateTime (306) tags; both survive read -> write -> read intact. 3 new tests: read detection, round-trip preservation, and a no-extra-tags baseline.
1 parent 7cc65b2 commit cc77511

4 files changed

Lines changed: 199 additions & 7 deletions

File tree

xrspatial/geotiff/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,10 @@ def read_geotiff(source: str, *, window=None,
182182
if geo_info.gdal_metadata_xml is not None:
183183
attrs['gdal_metadata_xml'] = geo_info.gdal_metadata_xml
184184

185+
# Extra (non-managed) TIFF tags for pass-through
186+
if geo_info.extra_tags is not None:
187+
attrs['extra_tags'] = geo_info.extra_tags
188+
185189
# Resolution / DPI metadata
186190
if geo_info.x_resolution is not None:
187191
attrs['x_resolution'] = geo_info.x_resolution
@@ -282,6 +286,7 @@ def write_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
282286
y_res = None
283287
res_unit = None
284288
gdal_meta_xml = None
289+
extra_tags_list = None
285290

286291
# Resolve crs argument: can be int (EPSG) or str (WKT/PROJ)
287292
if isinstance(crs, int):
@@ -311,6 +316,8 @@ def write_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
311316
if isinstance(gdal_meta_dict, dict):
312317
from ._geotags import _build_gdal_metadata_xml
313318
gdal_meta_xml = _build_gdal_metadata_xml(gdal_meta_dict)
319+
# Extra tags for pass-through
320+
extra_tags_list = data.attrs.get('extra_tags')
314321
# Resolution / DPI from attrs
315322
x_res = data.attrs.get('x_resolution')
316323
y_res = data.attrs.get('y_resolution')
@@ -341,6 +348,7 @@ def write_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
341348
y_resolution=y_res,
342349
resolution_unit=res_unit,
343350
gdal_metadata_xml=gdal_meta_xml,
351+
extra_tags=extra_tags_list,
344352
)
345353

346354

xrspatial/geotiff/_geotags.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,38 @@
66

77
from ._header import (
88
IFD,
9-
TAG_MODEL_PIXEL_SCALE,
10-
TAG_MODEL_TIEPOINT,
9+
TAG_IMAGE_WIDTH, TAG_IMAGE_LENGTH, TAG_BITS_PER_SAMPLE,
10+
TAG_COMPRESSION, TAG_PHOTOMETRIC,
11+
TAG_STRIP_OFFSETS, TAG_SAMPLES_PER_PIXEL,
12+
TAG_ROWS_PER_STRIP, TAG_STRIP_BYTE_COUNTS,
13+
TAG_X_RESOLUTION, TAG_Y_RESOLUTION,
14+
TAG_PLANAR_CONFIG, TAG_RESOLUTION_UNIT,
15+
TAG_PREDICTOR, TAG_COLORMAP,
16+
TAG_TILE_WIDTH, TAG_TILE_LENGTH,
17+
TAG_TILE_OFFSETS, TAG_TILE_BYTE_COUNTS,
18+
TAG_SAMPLE_FORMAT, TAG_GDAL_METADATA, TAG_GDAL_NODATA,
19+
TAG_MODEL_PIXEL_SCALE, TAG_MODEL_TIEPOINT,
1120
TAG_MODEL_TRANSFORMATION,
12-
TAG_GEO_KEY_DIRECTORY,
13-
TAG_GEO_DOUBLE_PARAMS,
14-
TAG_GEO_ASCII_PARAMS,
15-
TAG_GDAL_NODATA,
21+
TAG_GEO_KEY_DIRECTORY, TAG_GEO_DOUBLE_PARAMS, TAG_GEO_ASCII_PARAMS,
1622
)
1723

24+
# Tag IDs that the writer manages itself.  Any IFD entry whose tag ID is
# NOT in this set is treated as an "extra" tag and can be passed through
# unchanged on a read -> write round-trip.
_MANAGED_TAGS = frozenset((
    # Image dimensions and sample encoding
    TAG_IMAGE_WIDTH,
    TAG_IMAGE_LENGTH,
    TAG_BITS_PER_SAMPLE,
    TAG_COMPRESSION,
    TAG_PHOTOMETRIC,
    TAG_SAMPLES_PER_PIXEL,
    TAG_PLANAR_CONFIG,
    TAG_PREDICTOR,
    TAG_COLORMAP,
    TAG_SAMPLE_FORMAT,
    # Strip layout
    TAG_STRIP_OFFSETS,
    TAG_ROWS_PER_STRIP,
    TAG_STRIP_BYTE_COUNTS,
    # Tile layout
    TAG_TILE_WIDTH,
    TAG_TILE_LENGTH,
    TAG_TILE_OFFSETS,
    TAG_TILE_BYTE_COUNTS,
    # Resolution
    TAG_X_RESOLUTION,
    TAG_Y_RESOLUTION,
    TAG_RESOLUTION_UNIT,
    # GDAL tags
    TAG_GDAL_METADATA,
    TAG_GDAL_NODATA,
    # Geo tags
    TAG_MODEL_PIXEL_SCALE,
    TAG_MODEL_TIEPOINT,
    TAG_MODEL_TRANSFORMATION,
    TAG_GEO_KEY_DIRECTORY,
    TAG_GEO_DOUBLE_PARAMS,
    TAG_GEO_ASCII_PARAMS,
))
40+
1841
# GeoKey IDs
1942
GEOKEY_MODEL_TYPE = 1024
2043
GEOKEY_RASTER_TYPE = 1025
@@ -113,6 +136,9 @@ class GeoInfo:
113136
# and {(name, band): value} for per-band items. Raw XML also kept.
114137
gdal_metadata: dict | None = None
115138
gdal_metadata_xml: str | None = None
139+
# Extra TIFF tags not managed by the writer (pass-through on round-trip)
140+
# List of (tag_id, type_id, count, raw_value) tuples.
141+
extra_tags: list | None = None
116142
# Raw geokeys dict for anything else
117143
geokeys: dict[int, int | float | str] = field(default_factory=dict)
118144

@@ -450,6 +476,14 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
450476
b = raw_cmap[2 * n_colors + i] / 65535.0
451477
colormap.append((r, g, b, 1.0))
452478

479+
# Collect extra (non-managed) tags for pass-through
480+
extra_tags = []
481+
for tag_id, entry in ifd.entries.items():
482+
if tag_id not in _MANAGED_TAGS:
483+
extra_tags.append((tag_id, entry.type_id, entry.count, entry.value))
484+
if not extra_tags:
485+
extra_tags = None
486+
453487
# Resolve EPSG -> WKT via pyproj if available
454488
crs_wkt = None
455489
if epsg is not None:
@@ -483,6 +517,7 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
483517
crs_wkt=crs_wkt,
484518
gdal_metadata=gdal_metadata,
485519
gdal_metadata_xml=gdal_metadata_xml,
520+
extra_tags=extra_tags,
486521
geokeys=geokeys,
487522
)
488523

xrspatial/geotiff/_writer.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,
414414
is_cog: bool = False,
415415
raster_type: int = 1,
416416
gdal_metadata_xml: str | None = None,
417+
extra_tags: list | None = None,
417418
x_resolution: float | None = None,
418419
y_resolution: float | None = None,
419420
resolution_unit: int | None = None) -> bytes:
@@ -523,6 +524,14 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,
523524
tags.append((TAG_GDAL_METADATA, ASCII,
524525
len(gdal_metadata_xml) + 1, gdal_metadata_xml))
525526

527+
# Extra tags (pass-through from source file)
528+
if extra_tags is not None:
529+
for etag_id, etype_id, ecount, evalue in extra_tags:
530+
# Skip any tag we already wrote to avoid duplicates
531+
existing_ids = {t[0] for t in tags}
532+
if etag_id not in existing_ids:
533+
tags.append((etag_id, etype_id, ecount, evalue))
534+
526535
ifd_specs.append(tags)
527536

528537
# --- Determine if BigTIFF is needed ---
@@ -711,7 +720,8 @@ def write(data: np.ndarray, path: str, *,
711720
x_resolution: float | None = None,
712721
y_resolution: float | None = None,
713722
resolution_unit: int | None = None,
714-
gdal_metadata_xml: str | None = None) -> None:
723+
gdal_metadata_xml: str | None = None,
724+
extra_tags: list | None = None) -> None:
715725
"""Write a numpy array as a GeoTIFF or COG.
716726
717727
Parameters
@@ -781,6 +791,7 @@ def write(data: np.ndarray, path: str, *,
781791
parts, geo_transform, crs_epsg, nodata, is_cog=cog,
782792
raster_type=raster_type,
783793
gdal_metadata_xml=gdal_metadata_xml,
794+
extra_tags=extra_tags,
784795
x_resolution=x_resolution, y_resolution=y_resolution,
785796
resolution_unit=resolution_unit,
786797
)

xrspatial/geotiff/tests/test_features.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,144 @@ def test_no_crs_no_wkt(self, tmp_path):
417417
# GDAL metadata (tag 42112)
418418
# -----------------------------------------------------------------------
419419

420+
# -----------------------------------------------------------------------
421+
# Arbitrary tag preservation
422+
# -----------------------------------------------------------------------
423+
424+
class TestExtraTags:
    """Tests that non-managed TIFF tags are exposed on read and kept on write."""

    def _make_tiff_with_extra_tags(self, tmp_path):
        """Build a TIFF with Software (305) and DateTime (306) tags.

        Hand-assembles a little-endian classic TIFF with a single IFD and
        one uncompressed float32 strip, then writes it to
        ``tmp_path / 'extra_tags.tif'``.

        Parameters
        ----------
        tmp_path : path-like supporting ``/``
            Directory in which the file is created.

        Returns
        -------
        tuple
            ``(path, pixels)`` -- the file path as ``str`` and the float32
            array whose bytes were written as the pixel data.
        """
        import struct

        bo = '<'
        width, height = 4, 4
        pixels = np.arange(width * height,
                           dtype=np.float32).reshape(height, width)
        pixel_bytes = pixels.tobytes()

        # Each entry is (tag_id, type_id, count, raw_value_bytes).
        tag_list = []

        def add_short(tag, val):
            tag_list.append((tag, 3, 1, struct.pack(f'{bo}H', val)))

        def add_long(tag, val):
            tag_list.append((tag, 4, 1, struct.pack(f'{bo}I', val)))

        def add_ascii(tag, text):
            raw = text.encode('ascii') + b'\x00'
            tag_list.append((tag, 2, len(raw), raw))

        add_short(256, width)             # ImageWidth
        add_short(257, height)            # ImageLength
        add_short(258, 32)                # BitsPerSample
        add_short(259, 1)                 # Compression: none
        add_short(262, 1)                 # PhotometricInterpretation
        add_short(277, 1)                 # SamplesPerPixel
        add_short(278, height)            # RowsPerStrip
        add_long(273, 0)                  # StripOffsets placeholder
        add_long(279, len(pixel_bytes))   # StripByteCounts
        add_short(339, 3)                 # SampleFormat: float
        add_ascii(305, 'TestSoftware v1.0')
        add_ascii(306, '2025:01:15 12:00:00')

        tag_list.sort(key=lambda t: t[0])
        num_entries = len(tag_list)
        ifd_start = 8
        # Entry count (2) + 12 bytes per entry + next-IFD offset (4).
        ifd_size = 2 + 12 * num_entries + 4
        overflow_start = ifd_start + ifd_size

        # Values longer than 4 bytes do not fit inline in an IFD entry, so
        # they go into an overflow area directly after the IFD.  One pass
        # is enough: the StripOffsets value patched below stays 4 bytes
        # wide and inline, so it cannot move any overflow offset.
        overflow_buf = bytearray()
        tag_offsets = {}
        for tag, _typ, _count, raw in tag_list:
            if len(raw) <= 4:
                continue
            tag_offsets[tag] = len(overflow_buf)
            overflow_buf.extend(raw)
            if len(overflow_buf) % 2:
                # Pad so the next value starts on an even offset.
                overflow_buf.append(0)

        # Pixel data follows the overflow area; patch the placeholder.
        pixel_data_start = overflow_start + len(overflow_buf)
        strip_offset = struct.pack(f'{bo}I', pixel_data_start)
        tag_list = [
            (tag, typ, count, strip_offset if tag == 273 else raw)
            for tag, typ, count, raw in tag_list
        ]

        out = bytearray()
        out.extend(b'II')
        out.extend(struct.pack(f'{bo}H', 42))
        out.extend(struct.pack(f'{bo}I', ifd_start))
        out.extend(struct.pack(f'{bo}H', num_entries))
        for tag, typ, count, raw in tag_list:
            out.extend(struct.pack(f'{bo}HHI', tag, typ, count))
            if len(raw) <= 4:
                out.extend(raw.ljust(4, b'\x00'))
            else:
                ptr = overflow_start + tag_offsets[tag]
                out.extend(struct.pack(f'{bo}I', ptr))
        out.extend(struct.pack(f'{bo}I', 0))  # next-IFD offset: none
        out.extend(overflow_buf)
        out.extend(pixel_bytes)

        path = str(tmp_path / 'extra_tags.tif')
        with open(path, 'wb') as f:
            f.write(bytes(out))
        return path, pixels

    def test_extra_tags_read(self, tmp_path):
        """Extra tags are collected in attrs['extra_tags']."""
        path, _ = self._make_tiff_with_extra_tags(tmp_path)
        da = read_geotiff(path)

        extra = da.attrs.get('extra_tags')
        assert extra is not None
        tag_ids = {t[0] for t in extra}
        assert 305 in tag_ids  # Software
        assert 306 in tag_ids  # DateTime

    def test_extra_tags_round_trip(self, tmp_path):
        """Extra tags survive read -> write -> read."""
        path, pixels = self._make_tiff_with_extra_tags(tmp_path)
        da = read_geotiff(path)

        out_path = str(tmp_path / 'roundtrip.tif')
        write_geotiff(da, out_path, compression='none')

        da2 = read_geotiff(out_path)

        # Pixels should match
        np.testing.assert_array_equal(da2.values, pixels)

        # Extra tags should survive
        extra2 = da2.attrs.get('extra_tags')
        assert extra2 is not None
        tag_map = {t[0]: t[3] for t in extra2}
        assert 305 in tag_map
        assert 'TestSoftware v1.0' in str(tag_map[305])
        assert 306 in tag_map
        assert '2025:01:15' in str(tag_map[306])

    def test_no_extra_tags(self, tmp_path):
        """Files with only managed tags have no extra_tags attr."""
        arr = np.ones((4, 4), dtype=np.float32)
        path = str(tmp_path / 'no_extra.tif')
        write(arr, path, compression='none', tiled=False)

        da = read_geotiff(path)
        assert 'extra_tags' not in da.attrs
556+
557+
420558
class TestGDALMetadata:
421559

422560
def test_parse_gdal_metadata_xml(self):

0 commit comments

Comments
 (0)