Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-metadata-state.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module,last_inspected,issue,severity_max,categories_found,notes
geotiff,2026-05-11,1616,HIGH,4,_vrt.read_vrt leaked integer source nodata sentinel through int->float cast when VRT declared a float dataType; downstream NaN-aware code saw sentinel as valid data (#1616).
geotiff,2026-05-11,1632,HIGH,1;5,extract_geo_info dropped user-defined CRS WKT: GeoKey *CSTypeGeoKey=32767 files emitted attrs[crs_name] only (no crs/crs_wkt) so to_geotiff silently lost the projection on read->write. Fixed by promoting WKT-looking citation to crs_wkt across all four read backends (#1632).
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.
49 changes: 49 additions & 0 deletions xrspatial/geotiff/_geotags.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,46 @@ def _epsg_to_wkt(epsg: int) -> str | None:
return None


# Top-level WKT 1 / WKT 2 keywords. GDAL stores user-defined CRSes
# (GeoKey *CSTypeGeoKey == 32767) as WKT in the citation, so callers
# of ``extract_geo_info`` need a cheap test to decide whether the
# citation string round-trips as a CRS or is just a free-form name.
# Listed in rough order of how often they appear in real-world files.
_WKT_PREFIXES = (
'PROJCS[',
'GEOGCS[',
'PROJCRS[',
'GEOGCRS[',
'COMPD_CS[',
'COMPOUNDCRS[',
'BOUNDCRS[',
'VERT_CS[',
'VERTCRS[',
'LOCAL_CS[',
'ENGCRS[',
'PARAMETRICCRS[',
'TIMECRS[',
'DERIVEDPROJCRS[',
)


def _looks_like_wkt(text) -> bool:
"""Return True iff ``text`` opens with a known WKT root keyword.

The check is intentionally surface-level: only the first
non-whitespace stretch is inspected, no parser is invoked. The
callers only need to distinguish "this citation is actually a WKT
string GDAL stuffed in here" from "this is a human-readable CRS
name" -- a deeper parse would be both slower and a different bug
surface. False positives on names that happen to start with a WKT
keyword followed by '[' are vanishingly rare in practice.
"""
if not isinstance(text, str):
return False
head = text.lstrip().upper()
return any(head.startswith(p) for p in _WKT_PREFIXES)


def _parse_geokeys(ifd: IFD, data: bytes | memoryview,
byte_order: str) -> dict[int, int | float | str]:
"""Parse the GeoKeyDirectory and resolve values from param tags.
Expand Down Expand Up @@ -586,6 +626,15 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
crs_wkt = None
if epsg is not None:
crs_wkt = _epsg_to_wkt(epsg)
elif _looks_like_wkt(crs_name):
# User-defined CRS: GeoKey GEOKEY_*_CS_TYPE == 32767 and the WKT
# lives in the citation. Expose it as crs_wkt so the writer can
# round-trip it. The citation itself stays in crs_name for callers
# that expect that key. Without this branch a read -> write cycle
# silently drops the projection on user-defined CRS files because
# to_geotiff only consults attrs['crs'] / attrs['crs_wkt']. See
# issue #1632.
crs_wkt = crs_name

return GeoInfo(
transform=transform,
Expand Down
241 changes: 241 additions & 0 deletions xrspatial/geotiff/tests/test_user_defined_crs_wkt_1632.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
"""Regression tests for issue #1632.

Files with a user-defined CRS (no EPSG, WKT stored in the GeoTIFF
citation under ``GEOKEY_*_CS_TYPE == 32767``) used to round-trip with
``attrs['crs_name']`` set but ``attrs['crs_wkt']`` and ``attrs['crs']``
unset. ``to_geotiff`` only consults the latter two, so a read -> write
cycle silently dropped the projection.

The fix promotes the citation to ``attrs['crs_wkt']`` whenever no EPSG
is resolved and the citation parses as WKT (starts with one of the
known WKT 1 / WKT 2 root keywords). ``crs_name`` stays populated for
back-compat. Tests pin the contract across all four read backends and
across the read -> write -> read round trip.
"""
from __future__ import annotations

import importlib.util

import numpy as np
import pytest
import xarray as xr

import tifffile
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in eb600ba: switched to pytest.importorskip


from xrspatial.geotiff import open_geotiff, to_geotiff
from xrspatial.geotiff._geotags import _looks_like_wkt


# A user-defined Lambert Conformal Conic that pyproj cannot identify
# as a registered EPSG. Trimmed to keep test fixtures readable.
_USER_DEFINED_WKT = (
'PROJCS["User defined LCC",'
'GEOGCS["NAD83",'
'DATUM["North American Datum 1983",'
'SPHEROID["GRS 1980",6378137,298.257222101]],'
'PRIMEM["Greenwich",0],'
'UNIT["degree",0.0174532925199433]],'
'PROJECTION["Lambert Conformal Conic 2SP"],'
'PARAMETER["central_meridian",-100],'
'PARAMETER["latitude_of_origin",40],'
'PARAMETER["standard_parallel_1",30],'
'PARAMETER["standard_parallel_2",50],'
'UNIT["metre",1]]'
)


def _gpu_available() -> bool:
if importlib.util.find_spec("cupy") is None:
return False
try:
import cupy
return bool(cupy.cuda.is_available())
except Exception:
return False


_HAS_GPU = _gpu_available()
_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")


def _write_user_defined_crs_tif(path):
"""Write a tiny GeoTIFF with WKT-only CRS and return the DataArray written."""
arr = np.ones((4, 4), dtype=np.float32)
da = xr.DataArray(
arr, dims=['y', 'x'],
coords={'y': np.linspace(50.0, 47.0, 4),
'x': np.linspace(10.0, 13.0, 4)},
attrs={'crs_wkt': _USER_DEFINED_WKT},
)
to_geotiff(da, path, compression='none')
return da


# ---------------------------------------------------------------------------
# _looks_like_wkt unit tests
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("text", [
'PROJCS["foo"]',
'GEOGCS["foo"]',
'PROJCRS["foo"]',
'GEOGCRS["foo"]',
'COMPD_CS["foo"]',
'COMPOUNDCRS["foo"]',
'BOUNDCRS["foo"]',
'VERT_CS["foo"]',
'VERTCRS["foo"]',
'LOCAL_CS["foo"]',
' PROJCS["leading whitespace"]',
'projcs["lowercase"]',
])
def test_looks_like_wkt_positive(text):
"""Top-level WKT 1 / WKT 2 keywords parse as WKT."""
assert _looks_like_wkt(text)


def test_looks_like_wkt_requires_bracket():
"""A keyword without the opening bracket is not WKT."""
# "PROJCS" alone is a token, not a complete WKT element. The check
# demands the bracket so plain-text references to WKT keywords in
# human-readable names do not collide with the WKT path.
assert not _looks_like_wkt("PROJCS")
assert not _looks_like_wkt("PROJCS no bracket here")


@pytest.mark.parametrize("text", [
None,
'',
'NAD83 / UTM Zone 12N', # human-readable name, not WKT
'epsg:4326', # urn-like
'WGS 84',
'Some random string',
'PROJ string +proj=longlat +datum=WGS84',
42, # non-string input
b'PROJCS["bytes input"]', # bytes, not str
])
def test_looks_like_wkt_negative(text):
"""Non-WKT inputs return False (including non-string types)."""
assert not _looks_like_wkt(text)


# ---------------------------------------------------------------------------
# Read-side: backends emit crs_wkt for user-defined CRS files
# ---------------------------------------------------------------------------


def test_eager_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The eager numpy read populates attrs['crs_wkt'] when the file's
citation carries WKT, even without an EPSG."""
p = str(tmp_path / "user_defined_crs.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p)
assert rd.attrs.get("crs") is None # no EPSG
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")
# crs_name is kept for back-compat
assert rd.attrs.get("crs_name") == rd.attrs["crs_wkt"]


def test_dask_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The dask read path emits the same crs_wkt as numpy."""
p = str(tmp_path / "user_defined_crs_dask.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p, chunks=4)
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")


@_gpu_only
def test_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The cupy / GPU read path emits the same crs_wkt as numpy."""
p = str(tmp_path / "user_defined_crs_gpu.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p, gpu=True)
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")


@_gpu_only
def test_dask_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The dask+cupy read path emits the same crs_wkt as numpy."""
p = str(tmp_path / "user_defined_crs_dask_gpu.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p, gpu=True, chunks=4)
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")


# ---------------------------------------------------------------------------
# Read -> write -> read round trip: WKT survives the second write
# ---------------------------------------------------------------------------


def test_user_defined_crs_round_trips_through_to_geotiff(tmp_path):
"""A read -> write of a user-defined CRS file keeps the projection.

Pre-fix, ``to_geotiff(open_geotiff(src), dst)`` produced ``dst`` with
no GeoKey CRS entries and no GeoAsciiParams tag because the read path
only set ``attrs['crs_name']`` and the writer never consults that.
"""
src = str(tmp_path / "round_trip_src.tif")
_write_user_defined_crs_tif(src)

rd = open_geotiff(src)
dst = str(tmp_path / "round_trip_dst.tif")
to_geotiff(rd, dst, compression='none')

# The second file should carry the same WKT in its citation.
rd2 = open_geotiff(dst)
assert rd2.attrs.get("crs_wkt") == rd.attrs.get("crs_wkt")

# And the raw GeoKey + ASCII tags must be present.
with tifffile.TiffFile(dst) as tif:
keys = tif.pages[0].tags.get(34735) # GeoKeyDirectory
ascii_tag = tif.pages[0].tags.get(34737) # GeoAsciiParams
assert keys is not None
# GeoKeyDirectory header is 4 entries; a real CRS adds 3+ key
# entries (model type, raster type, GTCitation -> ascii ref).
assert len(keys.value) > 4
assert ascii_tag is not None
assert "PROJCS[" in ascii_tag.value


def test_epsg_crs_unchanged_by_fix(tmp_path):
"""The fix must not regress the EPSG path: files with attrs['crs'] = <int>
should still emit both crs and crs_wkt on read."""
arr = np.ones((4, 4), dtype=np.float32)
da = xr.DataArray(
arr, dims=['y', 'x'],
coords={'y': np.linspace(50.0, 47.0, 4),
'x': np.linspace(10.0, 13.0, 4)},
attrs={'crs': 4326},
)
p = str(tmp_path / "epsg.tif")
to_geotiff(da, p, compression='none')

rd = open_geotiff(p)
assert rd.attrs.get("crs") == 4326
assert rd.attrs.get("crs_wkt") is not None
# The WKT here is pyproj's canonical 4326 WKT; the citation is the
# short EPSG-style name "WGS 84", not WKT, so crs_name should not
# be promoted to crs_wkt.
assert rd.attrs.get("crs_name") != rd.attrs.get("crs_wkt")


def test_human_readable_crs_name_not_promoted_to_crs_wkt(tmp_path):
"""A citation that is a human-readable name (not WKT) must stay in
crs_name only. The _looks_like_wkt gate prevents accidental promotion."""
# tifffile-built file with citation 'NAD83 / UTM Zone 12N' as the
# citation, no EPSG. We can't easily build the GeoKey table from
# scratch here without recapitulating extract_geo_info; instead we
# exercise the path via the helper directly.
assert not _looks_like_wkt("NAD83 / UTM Zone 12N")
assert not _looks_like_wkt("WGS 84")
assert not _looks_like_wkt("")
assert not _looks_like_wkt(None)
Loading