Skip to content

Commit eb6955e

Browse files
authored
geotiff: grow HTTP COG header prefetch past 64 KiB (#1718) (#1727)
_parse_cog_http_meta used to fetch 16 KiB and retry once with 64 KiB. COGs whose IFD chain or out-of-line tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA) sat past 64 KiB silently lost IFDs or raised from parse_ifd's bounds checks. Replace the two-shot fetch with a grow loop that doubles the buffer until the IFD chain resolves, capped at MAX_HTTP_HEADER_BYTES (4 MiB). The fast path is unchanged. Closes #1718.
1 parent 4a48baf commit eb6955e

2 files changed

Lines changed: 355 additions & 14 deletions

File tree

xrspatial/geotiff/_reader.py

Lines changed: 96 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,33 +1518,115 @@ def _decode_one(job):
15181518
# COG HTTP reader
15191519
# ---------------------------------------------------------------------------
15201520

1521+
#: Initial prefetch size for ``_parse_cog_http_meta``. Sized for the common
#: case (a single-IFD COG with modest GeoTIFF tags) so the fast path is a
#: single range GET.
INITIAL_HTTP_HEADER_BYTES = 16 * 1024

#: Upper bound on how far ``_parse_cog_http_meta`` will grow its prefetch
#: buffer before giving up. 4 MiB comfortably covers deep pyramids whose
#: IFD chains plus tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA)
#: extend far past the initial fetch window. Exists mainly to bound the
#: blast radius of a malformed file, not to constrain valid pyramids.
#: See issue #1718.
MAX_HTTP_HEADER_BYTES = 4 * 1024 * 1024
1531+
1532+
1533+
def _ifd_required_extent(
1534+
ifds: list[IFD], header: TIFFHeader, data_len: int,
1535+
) -> int:
1536+
"""Return the highest byte offset the parsed IFDs reference.
1537+
1538+
Used to decide whether the prefetch buffer is large enough to hold the
1539+
entire IFD chain plus every out-of-line tag value. We compare this
1540+
against ``len(data)`` in :func:`_parse_cog_http_meta`; if it exceeds the
1541+
buffer, the chain is truncated and the caller must grow and retry.
1542+
1543+
The walk re-derives each tag's value-area placement directly from the
1544+
IFD layout (entry table base + entry slot) rather than re-parsing the
1545+
raw bytes. For out-of-line tags ``parse_ifd`` already resolved the
1546+
pointer and validated ``ptr + size <= data_len``; the *interesting*
1547+
extent for the grow loop is the next-IFD pointer of the chain tail,
1548+
plus an "is there a next IFD we have not yet seen" probe.
1549+
"""
1550+
if not ifds:
1551+
return 0
1552+
1553+
required = 0
1554+
# Last IFD's next_ifd_offset: 0 means end-of-chain; anything else
1555+
# points at an IFD we haven't parsed yet because it sat past the
1556+
# buffer (parse_all_ifds stops on offset >= len(data)).
1557+
tail_next = ifds[-1].next_ifd_offset
1558+
if tail_next != 0:
1559+
# Need at least enough bytes to reach the next IFD header. Pad
1560+
# by a small amount so parse_ifd can read the num_entries field
1561+
# without truncation -- the actual entry table is bounded by the
1562+
# parser's own checks on the next grow iteration.
1563+
required = max(required, tail_next + 64)
1564+
1565+
# Out-of-line tag values are already parsed (parse_ifd bounds-checked
1566+
# ptr + total_size <= len(data) before reading). For grow logic we
1567+
# only need to ensure those checks did not *fail*; a thrown
1568+
# ValueError surfaces in parse_all_ifds and is handled by the loop.
1569+
return required
1570+
1571+
15211572
def _parse_cog_http_meta(
15221573
source: _HTTPSource,
15231574
overview_level: int | None = None,
15241575
) -> tuple[TIFFHeader, IFD, GeoInfo, bytes]:
15251576
"""Fetch + parse the leading IFDs of an HTTP COG once.
15261577
1527-
Issues one (or rarely two) range GET(s) for the leading 16 KB / 64 KB
1528-
of the file, parses the header and IFD list, and returns the selected
1529-
IFD plus the raw header bytes (kept for ``extract_geo_info`` callers
1530-
that might want the IFD's tag offsets).
1578+
The fast path is a single 16 KiB range GET. When the IFD chain or its
1579+
out-of-line tag arrays extend past that window the buffer is doubled
1580+
and reparsed until either the chain is fully resolved or the cap at
1581+
:data:`MAX_HTTP_HEADER_BYTES` is reached. Real COGs whose pyramid
1582+
metadata legitimately exceeds the cap need a different strategy
1583+
(lazy per-IFD reads); the cap exists to bound a malformed-file blast
1584+
radius rather than to constrain valid pyramids.
15311585
15321586
Pulled out of :func:`_read_cog_http` so :func:`read_geotiff_dask`
15331587
can parse metadata once per graph rather than once per chunk task
15341588
(P5: each delayed task used to fire its own 16 KB header GET).
15351589
"""
1536-
header_bytes = source.read_range(0, 16384)
1590+
fetch_size = INITIAL_HTTP_HEADER_BYTES
1591+
header_bytes = source.read_range(0, fetch_size)
15371592
header = parse_header(header_bytes)
1538-
ifds = parse_all_ifds(header_bytes, header)
15391593

1540-
# parse_all_ifds bails the moment it walks past the bytes we
1541-
# fetched, so a header GET that lands short of the first IFD's
1542-
# offset returns an empty list. Retry with a larger window in that
1543-
# case; this is *not* a partial-IFD recovery (overviews chained
1544-
# past the first 16 KiB are still loaded lazily by other readers).
1545-
if len(ifds) == 0:
1546-
header_bytes = source.read_range(0, 65536)
1547-
ifds = parse_all_ifds(header_bytes, header)
1594+
last_len = len(header_bytes)
1595+
ifds: list[IFD] = []
1596+
while True:
1597+
try:
1598+
ifds = parse_all_ifds(header_bytes, header)
1599+
required = _ifd_required_extent(ifds, header, len(header_bytes))
1600+
# Chain is fully resolved when every IFD parsed cleanly and
1601+
# the tail next_ifd_offset is reachable within the buffer
1602+
# (required == 0 means end-of-chain).
1603+
if ifds and required <= len(header_bytes):
1604+
break
1605+
except ValueError:
1606+
# parse_ifd raises when an out-of-line tag points past the
1607+
# buffer. Treat it the same as a truncated chain: grow and
1608+
# retry. If we are already at the cap and still failing, let
1609+
# the next iteration's cap check raise a clear error.
1610+
ifds = []
1611+
1612+
if fetch_size >= MAX_HTTP_HEADER_BYTES:
1613+
raise ValueError(
1614+
f"COG IFD chain or tag arrays extend past "
1615+
f"MAX_HTTP_HEADER_BYTES={MAX_HTTP_HEADER_BYTES} bytes; "
1616+
f"the file may be malformed or its pyramid metadata is "
1617+
f"unusually large for HTTP prefetch")
1618+
fetch_size = min(fetch_size * 2, MAX_HTTP_HEADER_BYTES)
1619+
header_bytes = source.read_range(0, fetch_size)
1620+
# Server returned the same number of bytes as last time: we have
1621+
# hit EOF on the underlying file. No point growing further; if
1622+
# the IFD chain still doesn't resolve, the file is truncated.
1623+
if len(header_bytes) == last_len:
1624+
try:
1625+
ifds = parse_all_ifds(header_bytes, header)
1626+
except ValueError:
1627+
ifds = []
1628+
break
1629+
last_len = len(header_bytes)
15481630

15491631
if len(ifds) == 0:
15501632
raise ValueError("No IFDs found in COG")
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
"""Tests for HTTP COG metadata prefetch growing past 64 KiB (issue #1718).
2+
3+
The fast path is a single 16 KiB GET; if the IFD chain or its out-of-line
4+
tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA) extend past that
5+
window the prefetch buffer doubles until everything fits or it hits
6+
:data:`MAX_HTTP_HEADER_BYTES`.
7+
"""
8+
from __future__ import annotations
9+
10+
import http.server
11+
import socketserver
12+
import threading
13+
14+
import numpy as np
15+
import pytest
16+
17+
from xrspatial.geotiff._header import parse_all_ifds, parse_header
18+
from xrspatial.geotiff._reader import (
19+
INITIAL_HTTP_HEADER_BYTES,
20+
MAX_HTTP_HEADER_BYTES,
21+
_HTTPSource,
22+
_parse_cog_http_meta,
23+
_read_cog_http,
24+
)
25+
from xrspatial.geotiff._writer import write
26+
27+
28+
# ---------------------------------------------------------------------------
29+
# Helpers
30+
# ---------------------------------------------------------------------------
31+
32+
class _InMemoryHTTPSource(_HTTPSource):
    """``_HTTPSource`` stand-in that serves ranges from a bytes buffer.

    Every ``read_range`` call is recorded in ``read_range_calls`` so a
    test can assert exactly how many GETs the prefetch logic issued
    (e.g. the single-GET fast path for headers under 16 KiB).
    """

    def __init__(self, payload: bytes):
        # Deliberately do not chain to super().__init__: there is no
        # network connection to open and no SSRF/URL validation to run.
        self._payload = payload
        self._url = 'memory://test'
        self._pool = None
        self._size = len(payload)
        self.read_range_calls: list[tuple[int, int]] = []

    def read_range(self, start: int, length: int) -> bytes:
        # Record the request before serving it, like a real range GET.
        self.read_range_calls.append((start, length))
        stop = start + length
        return self._payload[start:stop]
50+
51+
52+
class _RangeHandler1718(http.server.BaseHTTPRequestHandler):
    """Loopback handler that honours ``Range: bytes=a-b`` over ``payload``."""

    payload: bytes = b''

    def do_GET(self):  # noqa: N802
        rng = self.headers.get('Range')
        if rng is None or not rng.startswith('bytes='):
            # No (or unrecognised) Range header: serve the whole file.
            body = self.payload
            self.send_response(200)
            self.send_header('Content-Type', 'application/octet-stream')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            return

        start_s, _, end_s = rng[len('bytes='):].partition('-')
        start = int(start_s)
        # An open-ended "bytes=N-" range runs to the final byte.
        end = int(end_s) if end_s else len(self.payload) - 1
        body = self.payload[start:end + 1]
        self.send_response(206)
        self.send_header('Content-Type', 'application/octet-stream')
        self.send_header(
            'Content-Range',
            f'bytes {start}-{start + len(body) - 1}/{len(self.payload)}',
        )
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *_args, **_kwargs):
        # Silence per-request logging so pytest output stays clean.
        pass
81+
82+
83+
def _serve(payload: bytes):
    """Start a daemonised loopback HTTP server for *payload*.

    Returns ``(httpd, thread)``; the caller is responsible for calling
    ``httpd.shutdown()`` and ``httpd.server_close()`` when finished.
    """
    # Bind the payload via a throwaway subclass so concurrently running
    # tests never share handler class state.
    bound_handler = type(
        'RangeHandler1718Bound', (_RangeHandler1718,), {'payload': payload}
    )
    server = socketserver.TCPServer(('127.0.0.1', 0), bound_handler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    return server, worker
91+
92+
93+
def _write_cog_with_big_metadata(path: str, arr: np.ndarray,
                                 metadata_pad_bytes: int) -> None:
    """Write a multi-overview COG whose level-0 IFD drags the rest of
    the chain past the prefetch windows.

    A GDAL_METADATA payload of roughly *metadata_pad_bytes* is stored as
    an out-of-line ASCII tag value, padding the value area between the
    first IFD and its overviews so the chained overview IFDs land well
    past the 16 KiB / 64 KiB fetch sizes.
    """
    filler = 'x' * metadata_pad_bytes
    big_xml = (
        '<GDALMetadata>'
        f'<Item name="filler">{filler}</Item>'
        '</GDALMetadata>'
    )
    write(arr, path, compression='deflate', tiled=True, tile_size=64,
          cog=True, overview_levels=[2, 4, 8],
          gdal_metadata_xml=big_xml)
109+
110+
111+
# ---------------------------------------------------------------------------
112+
# Fast path: small COG should fire a single 16 KiB read
113+
# ---------------------------------------------------------------------------
114+
115+
def test_small_cog_uses_single_initial_read(tmp_path):
    """A COG whose metadata fits in 16 KiB must cost exactly one GET."""
    data = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
    cog_path = str(tmp_path / 'small_1718_cog.tif')
    write(data, cog_path, compression='deflate', tiled=True, tile_size=32,
          cog=True, overview_levels=[1])

    payload = (tmp_path / 'small_1718_cog.tif').read_bytes()

    src = _InMemoryHTTPSource(payload)
    header, ifd, geo_info, header_bytes = _parse_cog_http_meta(src)

    # Fast path is exactly one read_range at the initial size.
    assert src.read_range_calls == [(0, INITIAL_HTTP_HEADER_BYTES)]
    # And that single buffer resolves the whole chain (tail pointer 0).
    assert parse_all_ifds(header_bytes, header)[-1].next_ifd_offset == 0
133+
134+
135+
# ---------------------------------------------------------------------------
136+
# Grow path: COG whose IFD chain extends past 64 KiB still parses
137+
# ---------------------------------------------------------------------------
138+
139+
def test_ifd_chain_past_64kib_resolves(tmp_path):
    """Grow loop must recover every IFD when the chain passes 64 KiB."""
    data = np.arange(256 * 256, dtype=np.float32).reshape(256, 256)
    cog_path = str(tmp_path / 'big_meta_1718_cog.tif')
    # 96 KiB of XML padding pushes the overview IFDs past both the
    # 16 KiB initial fetch and the legacy one-shot 64 KiB retry.
    _write_cog_with_big_metadata(cog_path, data, metadata_pad_bytes=96 * 1024)

    with open(cog_path, 'rb') as fh:
        payload = fh.read()

    # Fixture sanity: IFD #2 must genuinely sit past 64 KiB, otherwise
    # this test never exercises the grow loop at all.
    header = parse_header(payload)
    full_ifds = parse_all_ifds(payload, header)
    assert len(full_ifds) >= 2, "fixture must have >=2 IFDs"
    assert full_ifds[0].next_ifd_offset > 65536, (
        "fixture must place IFD #2 past 64 KiB to exercise the grow loop; "
        f"got next_ifd_offset={full_ifds[0].next_ifd_offset}"
    )

    src = _InMemoryHTTPSource(payload)
    _, _, _, header_bytes = _parse_cog_http_meta(src)

    grown_ifds = parse_all_ifds(header_bytes, header)
    assert len(grown_ifds) == len(full_ifds), (
        f"prefetch buffer lost IFDs: got {len(grown_ifds)} of {len(full_ifds)}"
    )
    # More than one read_range call proves the buffer actually grew...
    assert len(src.read_range_calls) > 1
    # ...and growth stayed within the configured cap.
    assert src.read_range_calls[-1][1] <= MAX_HTTP_HEADER_BYTES
170+
171+
172+
def test_end_to_end_http_read_with_big_metadata(tmp_path, monkeypatch):
    """_read_cog_http should match local read on a >64 KiB IFD-chain COG."""
    monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')
    data = np.arange(256 * 256, dtype=np.float32).reshape(256, 256)
    cog_path = str(tmp_path / 'http_big_1718_cog.tif')
    _write_cog_with_big_metadata(cog_path, data, metadata_pad_bytes=96 * 1024)

    with open(cog_path, 'rb') as fh:
        payload = fh.read()

    httpd, _thread = _serve(payload)
    try:
        url = f'http://127.0.0.1:{httpd.server_address[1]}/cog.tif'

        # Full-resolution read must round-trip exactly.
        full_res, _geo = _read_cog_http(url)
        np.testing.assert_array_equal(full_res, data)

        # An overview read over the same URL must also succeed.
        overview, _ = _read_cog_http(url, overview_level=1)
        assert overview.shape[0] < data.shape[0]
    finally:
        httpd.shutdown()
        httpd.server_close()
195+
196+
197+
# ---------------------------------------------------------------------------
198+
# Truncation / cap behaviour
199+
# ---------------------------------------------------------------------------
200+
201+
def test_cap_raises_clear_error_on_excessive_chain(monkeypatch):
    """When the IFD chain refuses to fit, hitting the cap raises ValueError.

    Patches MAX_HTTP_HEADER_BYTES tiny so the test does not need to
    fabricate a multi-megabyte payload to exercise the cap branch.
    """
    import os
    import struct as _struct
    import tempfile

    from xrspatial.geotiff import _reader

    # Build a payload whose first IFD's next-IFD offset deliberately
    # points to an address we will never reach. parse_all_ifds will
    # return the first IFD but tail_next > buffer, forcing the grow
    # loop. The payload itself is small so the server EOF branch is not
    # what raises -- we want the cap branch.
    arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
    fd, path = tempfile.mkstemp(suffix='_cap_1718.tif')
    os.close(fd)
    try:
        write(arr, path, compression='deflate', tiled=True, tile_size=16,
              cog=True, overview_levels=[1])
        with open(path, 'rb') as f:
            payload = bytearray(f.read())
    finally:
        # Previous version leaked the temp file (NamedTemporaryFile with
        # delete=False, never unlinked); remove it once the bytes are in
        # memory -- everything below works on the in-memory copy.
        os.unlink(path)

    header = parse_header(bytes(payload))
    ifds = parse_all_ifds(bytes(payload), header)
    assert len(ifds) >= 2

    # Locate the first IFD's next_ifd_offset slot and rewrite it to a
    # far-off value that no buffer growth will ever satisfy. The value
    # is masked to 32 bits (classic-TIFF offsets are u32), which still
    # lands ~3.3 GiB out -- far past the shrunken cap set below.
    bo = header.byte_order
    first_ifd_off = header.first_ifd_offset
    num_entries = _struct.unpack_from(f'{bo}H', payload, first_ifd_off)[0]
    next_off_pos = first_ifd_off + 2 + num_entries * 12
    far = 10**12
    _struct.pack_into(f'{bo}I', payload, next_off_pos, far & 0xFFFFFFFF)

    # Shrink the cap so the test is fast.
    monkeypatch.setattr(_reader, 'MAX_HTTP_HEADER_BYTES', 64 * 1024)

    src = _InMemoryHTTPSource(bytes(payload))
    # Wrap read_range so requests past EOF still return the requested
    # length (mimics an HTTPS server returning the full file when asked
    # for more). Without this the EOF branch short-circuits before the
    # cap branch fires.
    real_read = src.read_range

    def padded_read(start, length):
        data = real_read(start, length)
        if len(data) < length:
            # Pretend the file is longer than it is by zero-padding,
            # so the grow loop keeps growing until it hits the cap.
            data = data + b'\x00' * (length - len(data))
        return data

    src.read_range = padded_read  # type: ignore[assignment]

    with pytest.raises(ValueError, match='MAX_HTTP_HEADER_BYTES'):
        _parse_cog_http_meta(src)

0 commit comments

Comments
 (0)