Skip to content

Commit dbb5ceb

Browse files
authored
Promote user-defined CRS WKT to attrs['crs_wkt'] on read (#1632) (#1635)
* Promote user-defined CRS WKT to attrs['crs_wkt'] on read (#1632) GeoTIFFs with GeoKey *CSTypeGeoKey == 32767 store their CRS WKT in the GeoTIFF citation rather than referencing an EPSG code. extract_geo_info parsed the citation into geo_info.crs_name but left geo_info.crs_wkt unset, so _populate_attrs_from_geo_info emitted attrs['crs_name'] only. to_geotiff consults attrs['crs'] / attrs['crs_wkt'] but not 'crs_name', so a read -> write round-trip silently dropped the projection on user-defined CRS files. All four read backends were affected. The fix promotes the citation to geo_info.crs_wkt when the citation parses as WKT (top-level WKT 1 / WKT 2 root keywords: PROJCS[, GEOGCS[, PROJCRS[, GEOGCRS[, COMPD_CS[, COMPOUNDCRS[, BOUNDCRS[, VERT_CS[, VERTCRS[, LOCAL_CS[, ENGCRS, PARAMETRICCRS, TIMECRS, DERIVEDPROJCRS). crs_name stays set for back-compat. Citations that are plain names (e.g. 'NAD83 / UTM Zone 12N') are unchanged because _looks_like_wkt rejects them. * Guard tifffile import with pytest.importorskip on #1635 Matches the convention used elsewhere in xrspatial/geotiff/tests/. Test module no longer hard-fails collection on machines without tifffile installed.
1 parent caab40b commit dbb5ceb

3 files changed

Lines changed: 291 additions & 1 deletion

File tree

.claude/sweep-metadata-state.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
module,last_inspected,issue,severity_max,categories_found,notes
2-
geotiff,2026-05-11,1616,HIGH,4,_vrt.read_vrt leaked integer source nodata sentinel through int->float cast when VRT declared a float dataType; downstream NaN-aware code saw sentinel as valid data (#1616).
2+
geotiff,2026-05-11,1632,HIGH,1;5,extract_geo_info dropped user-defined CRS WKT: GeoKey *CSTypeGeoKey=32767 files emitted attrs[crs_name] only (no crs/crs_wkt) so to_geotiff silently lost the projection on read->write. Fixed by promoting WKT-looking citation to crs_wkt across all four read backends (#1632).
33
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.

xrspatial/geotiff/_geotags.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,46 @@ def _epsg_to_wkt(epsg: int) -> str | None:
243243
return None
244244

245245

246+
# Top-level WKT 1 / WKT 2 keywords. GDAL stores user-defined CRSes
247+
# (GeoKey *CSTypeGeoKey == 32767) as WKT in the citation, so callers
248+
# of ``extract_geo_info`` need a cheap test to decide whether the
249+
# citation string round-trips as a CRS or is just a free-form name.
250+
# Listed in rough order of how often they appear in real-world files.
251+
_WKT_PREFIXES = (
252+
'PROJCS[',
253+
'GEOGCS[',
254+
'PROJCRS[',
255+
'GEOGCRS[',
256+
'COMPD_CS[',
257+
'COMPOUNDCRS[',
258+
'BOUNDCRS[',
259+
'VERT_CS[',
260+
'VERTCRS[',
261+
'LOCAL_CS[',
262+
'ENGCRS[',
263+
'PARAMETRICCRS[',
264+
'TIMECRS[',
265+
'DERIVEDPROJCRS[',
266+
)
267+
268+
269+
def _looks_like_wkt(text) -> bool:
270+
"""Return True iff ``text`` opens with a known WKT root keyword.
271+
272+
The check is intentionally surface-level: only the first
273+
non-whitespace stretch is inspected, no parser is invoked. The
274+
callers only need to distinguish "this citation is actually a WKT
275+
string GDAL stuffed in here" from "this is a human-readable CRS
276+
name" -- a deeper parse would be both slower and a different bug
277+
surface. False positives on names that happen to start with a WKT
278+
keyword followed by '[' are vanishingly rare in practice.
279+
"""
280+
if not isinstance(text, str):
281+
return False
282+
head = text.lstrip().upper()
283+
return any(head.startswith(p) for p in _WKT_PREFIXES)
284+
285+
246286
def _parse_geokeys(ifd: IFD, data: bytes | memoryview,
247287
byte_order: str) -> dict[int, int | float | str]:
248288
"""Parse the GeoKeyDirectory and resolve values from param tags.
@@ -586,6 +626,15 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
586626
crs_wkt = None
587627
if epsg is not None:
588628
crs_wkt = _epsg_to_wkt(epsg)
629+
elif _looks_like_wkt(crs_name):
630+
# User-defined CRS: GeoKey GEOKEY_*_CS_TYPE == 32767 and the WKT
631+
# lives in the citation. Expose it as crs_wkt so the writer can
632+
# round-trip it. The citation itself stays in crs_name for callers
633+
# that expect that key. Without this branch a read -> write cycle
634+
# silently drops the projection on user-defined CRS files because
635+
# to_geotiff only consults attrs['crs'] / attrs['crs_wkt']. See
636+
# issue #1632.
637+
crs_wkt = crs_name
589638

590639
return GeoInfo(
591640
transform=transform,
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
"""Regression tests for issue #1632.
2+
3+
Files with a user-defined CRS (no EPSG, WKT stored in the GeoTIFF
4+
citation under ``GEOKEY_*_CS_TYPE == 32767``) used to round-trip with
5+
``attrs['crs_name']`` set but ``attrs['crs_wkt']`` and ``attrs['crs']``
6+
unset. ``to_geotiff`` only consults the latter two, so a read -> write
7+
cycle silently dropped the projection.
8+
9+
The fix promotes the citation to ``attrs['crs_wkt']`` whenever no EPSG
10+
is resolved and the citation parses as WKT (starts with one of the
11+
known WKT 1 / WKT 2 root keywords). ``crs_name`` stays populated for
12+
back-compat. Tests pin the contract across all four read backends and
13+
across the read -> write -> read round trip.
14+
"""
15+
from __future__ import annotations
16+
17+
import importlib.util
18+
19+
import numpy as np
20+
import pytest
21+
import xarray as xr
22+
23+
tifffile = pytest.importorskip("tifffile")
24+
25+
from xrspatial.geotiff import open_geotiff, to_geotiff
26+
from xrspatial.geotiff._geotags import _looks_like_wkt
27+
28+
29+
# A user-defined Lambert Conformal Conic that pyproj cannot identify
30+
# as a registered EPSG. Trimmed to keep test fixtures readable.
31+
_USER_DEFINED_WKT = (
32+
'PROJCS["User defined LCC",'
33+
'GEOGCS["NAD83",'
34+
'DATUM["North American Datum 1983",'
35+
'SPHEROID["GRS 1980",6378137,298.257222101]],'
36+
'PRIMEM["Greenwich",0],'
37+
'UNIT["degree",0.0174532925199433]],'
38+
'PROJECTION["Lambert Conformal Conic 2SP"],'
39+
'PARAMETER["central_meridian",-100],'
40+
'PARAMETER["latitude_of_origin",40],'
41+
'PARAMETER["standard_parallel_1",30],'
42+
'PARAMETER["standard_parallel_2",50],'
43+
'UNIT["metre",1]]'
44+
)
45+
46+
47+
def _gpu_available() -> bool:
48+
if importlib.util.find_spec("cupy") is None:
49+
return False
50+
try:
51+
import cupy
52+
return bool(cupy.cuda.is_available())
53+
except Exception:
54+
return False
55+
56+
57+
_HAS_GPU = _gpu_available()
58+
_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")
59+
60+
61+
def _write_user_defined_crs_tif(path):
62+
"""Write a tiny GeoTIFF with WKT-only CRS and return the DataArray written."""
63+
arr = np.ones((4, 4), dtype=np.float32)
64+
da = xr.DataArray(
65+
arr, dims=['y', 'x'],
66+
coords={'y': np.linspace(50.0, 47.0, 4),
67+
'x': np.linspace(10.0, 13.0, 4)},
68+
attrs={'crs_wkt': _USER_DEFINED_WKT},
69+
)
70+
to_geotiff(da, path, compression='none')
71+
return da
72+
73+
74+
# ---------------------------------------------------------------------------
75+
# _looks_like_wkt unit tests
76+
# ---------------------------------------------------------------------------
77+
78+
79+
@pytest.mark.parametrize("text", [
80+
'PROJCS["foo"]',
81+
'GEOGCS["foo"]',
82+
'PROJCRS["foo"]',
83+
'GEOGCRS["foo"]',
84+
'COMPD_CS["foo"]',
85+
'COMPOUNDCRS["foo"]',
86+
'BOUNDCRS["foo"]',
87+
'VERT_CS["foo"]',
88+
'VERTCRS["foo"]',
89+
'LOCAL_CS["foo"]',
90+
' PROJCS["leading whitespace"]',
91+
'projcs["lowercase"]',
92+
])
93+
def test_looks_like_wkt_positive(text):
94+
"""Top-level WKT 1 / WKT 2 keywords parse as WKT."""
95+
assert _looks_like_wkt(text)
96+
97+
98+
def test_looks_like_wkt_requires_bracket():
99+
"""A keyword without the opening bracket is not WKT."""
100+
# "PROJCS" alone is a token, not a complete WKT element. The check
101+
# demands the bracket so plain-text references to WKT keywords in
102+
# human-readable names do not collide with the WKT path.
103+
assert not _looks_like_wkt("PROJCS")
104+
assert not _looks_like_wkt("PROJCS no bracket here")
105+
106+
107+
@pytest.mark.parametrize("text", [
108+
None,
109+
'',
110+
'NAD83 / UTM Zone 12N', # human-readable name, not WKT
111+
'epsg:4326', # urn-like
112+
'WGS 84',
113+
'Some random string',
114+
'PROJ string +proj=longlat +datum=WGS84',
115+
42, # non-string input
116+
b'PROJCS["bytes input"]', # bytes, not str
117+
])
118+
def test_looks_like_wkt_negative(text):
119+
"""Non-WKT inputs return False (including non-string types)."""
120+
assert not _looks_like_wkt(text)
121+
122+
123+
# ---------------------------------------------------------------------------
124+
# Read-side: backends emit crs_wkt for user-defined CRS files
125+
# ---------------------------------------------------------------------------
126+
127+
128+
def test_eager_emits_crs_wkt_for_user_defined_crs(tmp_path):
129+
"""The eager numpy read populates attrs['crs_wkt'] when the file's
130+
citation carries WKT, even without an EPSG."""
131+
p = str(tmp_path / "user_defined_crs.tif")
132+
_write_user_defined_crs_tif(p)
133+
134+
rd = open_geotiff(p)
135+
assert rd.attrs.get("crs") is None # no EPSG
136+
assert rd.attrs.get("crs_wkt") is not None
137+
assert rd.attrs["crs_wkt"].startswith("PROJCS[")
138+
# crs_name is kept for back-compat
139+
assert rd.attrs.get("crs_name") == rd.attrs["crs_wkt"]
140+
141+
142+
def test_dask_emits_crs_wkt_for_user_defined_crs(tmp_path):
143+
"""The dask read path emits the same crs_wkt as numpy."""
144+
p = str(tmp_path / "user_defined_crs_dask.tif")
145+
_write_user_defined_crs_tif(p)
146+
147+
rd = open_geotiff(p, chunks=4)
148+
assert rd.attrs.get("crs_wkt") is not None
149+
assert rd.attrs["crs_wkt"].startswith("PROJCS[")
150+
151+
152+
@_gpu_only
153+
def test_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path):
154+
"""The cupy / GPU read path emits the same crs_wkt as numpy."""
155+
p = str(tmp_path / "user_defined_crs_gpu.tif")
156+
_write_user_defined_crs_tif(p)
157+
158+
rd = open_geotiff(p, gpu=True)
159+
assert rd.attrs.get("crs_wkt") is not None
160+
assert rd.attrs["crs_wkt"].startswith("PROJCS[")
161+
162+
163+
@_gpu_only
164+
def test_dask_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path):
165+
"""The dask+cupy read path emits the same crs_wkt as numpy."""
166+
p = str(tmp_path / "user_defined_crs_dask_gpu.tif")
167+
_write_user_defined_crs_tif(p)
168+
169+
rd = open_geotiff(p, gpu=True, chunks=4)
170+
assert rd.attrs.get("crs_wkt") is not None
171+
assert rd.attrs["crs_wkt"].startswith("PROJCS[")
172+
173+
174+
# ---------------------------------------------------------------------------
175+
# Read -> write -> read round trip: WKT survives the second write
176+
# ---------------------------------------------------------------------------
177+
178+
179+
def test_user_defined_crs_round_trips_through_to_geotiff(tmp_path):
180+
"""A read -> write of a user-defined CRS file keeps the projection.
181+
182+
Pre-fix, ``to_geotiff(open_geotiff(src), dst)`` produced ``dst`` with
183+
no GeoKey CRS entries and no GeoAsciiParams tag because the read path
184+
only set ``attrs['crs_name']`` and the writer never consults that.
185+
"""
186+
src = str(tmp_path / "round_trip_src.tif")
187+
_write_user_defined_crs_tif(src)
188+
189+
rd = open_geotiff(src)
190+
dst = str(tmp_path / "round_trip_dst.tif")
191+
to_geotiff(rd, dst, compression='none')
192+
193+
# The second file should carry the same WKT in its citation.
194+
rd2 = open_geotiff(dst)
195+
assert rd2.attrs.get("crs_wkt") == rd.attrs.get("crs_wkt")
196+
197+
# And the raw GeoKey + ASCII tags must be present.
198+
with tifffile.TiffFile(dst) as tif:
199+
keys = tif.pages[0].tags.get(34735) # GeoKeyDirectory
200+
ascii_tag = tif.pages[0].tags.get(34737) # GeoAsciiParams
201+
assert keys is not None
202+
# GeoKeyDirectory header is 4 entries; a real CRS adds 3+ key
203+
# entries (model type, raster type, GTCitation -> ascii ref).
204+
assert len(keys.value) > 4
205+
assert ascii_tag is not None
206+
assert "PROJCS[" in ascii_tag.value
207+
208+
209+
def test_epsg_crs_unchanged_by_fix(tmp_path):
210+
"""The fix must not regress the EPSG path: files with attrs['crs'] = <int>
211+
should still emit both crs and crs_wkt on read."""
212+
arr = np.ones((4, 4), dtype=np.float32)
213+
da = xr.DataArray(
214+
arr, dims=['y', 'x'],
215+
coords={'y': np.linspace(50.0, 47.0, 4),
216+
'x': np.linspace(10.0, 13.0, 4)},
217+
attrs={'crs': 4326},
218+
)
219+
p = str(tmp_path / "epsg.tif")
220+
to_geotiff(da, p, compression='none')
221+
222+
rd = open_geotiff(p)
223+
assert rd.attrs.get("crs") == 4326
224+
assert rd.attrs.get("crs_wkt") is not None
225+
# The WKT here is pyproj's canonical 4326 WKT; the citation is the
226+
# short EPSG-style name "WGS 84", not WKT, so crs_name should not
227+
# be promoted to crs_wkt.
228+
assert rd.attrs.get("crs_name") != rd.attrs.get("crs_wkt")
229+
230+
231+
def test_human_readable_crs_name_not_promoted_to_crs_wkt(tmp_path):
232+
"""A citation that is a human-readable name (not WKT) must stay in
233+
crs_name only. The _looks_like_wkt gate prevents accidental promotion."""
234+
# tifffile-built file with citation 'NAD83 / UTM Zone 12N' as the
235+
# citation, no EPSG. We can't easily build the GeoKey table from
236+
# scratch here without recapitulating extract_geo_info; instead we
237+
# exercise the path via the helper directly.
238+
assert not _looks_like_wkt("NAD83 / UTM Zone 12N")
239+
assert not _looks_like_wkt("WGS 84")
240+
assert not _looks_like_wkt("")
241+
assert not _looks_like_wkt(None)

0 commit comments

Comments
 (0)