110 changes: 96 additions & 14 deletions xrspatial/geotiff/_reader.py
@@ -1518,33 +1518,115 @@ def _decode_one(job):
# COG HTTP reader
# ---------------------------------------------------------------------------

#: Initial prefetch size for ``_parse_cog_http_meta``. Sized for the common
#: case (a single-IFD COG with modest GeoTIFF tags) so the fast path is a
#: single range GET.
INITIAL_HTTP_HEADER_BYTES = 16 * 1024

#: Upper bound on how far ``_parse_cog_http_meta`` will grow its prefetch
#: buffer before giving up. 4 MiB comfortably covers deep pyramids whose
#: IFD chains plus tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA)
#: extend far past the initial fetch window. See issue #1718.
MAX_HTTP_HEADER_BYTES = 4 * 1024 * 1024
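
# Worst-case fetch schedule implied by the two constants above -- a
# doctest-style illustration only; the reader never executes this.
# The grow loop doubles the window on each retry, so even a
# pathological header costs at most nine range GETs before the cap
# check fires:
#
#   >>> sizes = [INITIAL_HTTP_HEADER_BYTES]
#   >>> while sizes[-1] < MAX_HTTP_HEADER_BYTES:
#   ...     sizes.append(min(sizes[-1] * 2, MAX_HTTP_HEADER_BYTES))
#   >>> [s // 1024 for s in sizes]
#   [16, 32, 64, 128, 256, 512, 1024, 2048, 4096]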


def _ifd_required_extent(
ifds: list[IFD], header: TIFFHeader, data_len: int,
) -> int:
"""Return the highest byte offset the parsed IFDs reference.

    Used to decide whether the prefetch buffer is large enough to hold
    the entire IFD chain plus every out-of-line tag value.
    :func:`_parse_cog_http_meta` compares the result against the length
    of its current buffer; if it exceeds the buffer, the chain is
    truncated and the caller must grow and retry.

    The helper does not re-walk tag value areas: for out-of-line tags
    ``parse_ifd`` already resolved the pointer and validated
    ``ptr + size <= data_len``, so a chain that parsed cleanly has all
    of its tag data inside the buffer. The only extent that matters for
    the grow loop is the chain tail's next-IFD pointer, padded slightly
    so the next parse attempt can read the following IFD's entry count.
"""
if not ifds:
return 0

required = 0
# Last IFD's next_ifd_offset: 0 means end-of-chain; anything else
# points at an IFD we haven't parsed yet because it sat past the
# buffer (parse_all_ifds stops on offset >= len(data)).
tail_next = ifds[-1].next_ifd_offset
if tail_next != 0:
# Need at least enough bytes to reach the next IFD header. Pad
# by a small amount so parse_ifd can read the num_entries field
# without truncation -- the actual entry table is bounded by the
# parser's own checks on the next grow iteration.
required = max(required, tail_next + 64)

# Out-of-line tag values are already parsed (parse_ifd bounds-checked
# ptr + total_size <= len(data) before reading). For grow logic we
# only need to ensure those checks did not *fail*; a thrown
# ValueError surfaces in parse_all_ifds and is handled by the loop.
return required
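
# Example -- ``_FakeIFD`` stands in for the parsed IFD type, purely for
# illustration. An end-of-chain tail reports 0; a tail pointing past
# the buffer reports its offset plus the 64-byte probe pad:
#
#   >>> class _FakeIFD:
#   ...     def __init__(self, next_ifd_offset):
#   ...         self.next_ifd_offset = next_ifd_offset
#   >>> _ifd_required_extent([_FakeIFD(0)], header=None, data_len=16384)
#   0
#   >>> _ifd_required_extent([_FakeIFD(70000)], header=None, data_len=16384)
#   70064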


def _parse_cog_http_meta(
source: _HTTPSource,
overview_level: int | None = None,
) -> tuple[TIFFHeader, IFD, GeoInfo, bytes]:
"""Fetch + parse the leading IFDs of an HTTP COG once.

Issues one (or rarely two) range GET(s) for the leading 16 KB / 64 KB
of the file, parses the header and IFD list, and returns the selected
IFD plus the raw header bytes (kept for ``extract_geo_info`` callers
that might want the IFD's tag offsets).
The fast path is a single 16 KiB range GET. When the IFD chain or its
out-of-line tag arrays extend past that window the buffer is doubled
and reparsed until either the chain is fully resolved or the cap at
:data:`MAX_HTTP_HEADER_BYTES` is reached. Real COGs whose pyramid
metadata legitimately exceeds the cap need a different strategy
(lazy per-IFD reads); the cap exists to bound a malformed-file blast
radius rather than to constrain valid pyramids.

Pulled out of :func:`_read_cog_http` so :func:`read_geotiff_dask`
can parse metadata once per graph rather than once per chunk task
(P5: each delayed task used to fire its own 16 KB header GET).
"""
header_bytes = source.read_range(0, 16384)
fetch_size = INITIAL_HTTP_HEADER_BYTES
header_bytes = source.read_range(0, fetch_size)
header = parse_header(header_bytes)
ifds = parse_all_ifds(header_bytes, header)

# parse_all_ifds bails the moment it walks past the bytes we
# fetched, so a header GET that lands short of the first IFD's
# offset returns an empty list. Retry with a larger window in that
# case; this is *not* a partial-IFD recovery (overviews chained
# past the first 16 KiB are still loaded lazily by other readers).
if len(ifds) == 0:
header_bytes = source.read_range(0, 65536)
ifds = parse_all_ifds(header_bytes, header)
last_len = len(header_bytes)
ifds: list[IFD] = []
while True:
try:
ifds = parse_all_ifds(header_bytes, header)
required = _ifd_required_extent(ifds, header, len(header_bytes))
# Chain is fully resolved when every IFD parsed cleanly and
# the tail next_ifd_offset is reachable within the buffer
# (required == 0 means end-of-chain).
if ifds and required <= len(header_bytes):
break
except ValueError:
# parse_ifd raises when an out-of-line tag points past the
# buffer. Treat it the same as a truncated chain: grow and
# retry. If we are already at the cap and still failing, let
# the next iteration's cap check raise a clear error.
ifds = []

if fetch_size >= MAX_HTTP_HEADER_BYTES:
raise ValueError(
f"COG IFD chain or tag arrays extend past "
f"MAX_HTTP_HEADER_BYTES={MAX_HTTP_HEADER_BYTES} bytes; "
f"the file may be malformed or its pyramid metadata is "
f"unusually large for HTTP prefetch")
fetch_size = min(fetch_size * 2, MAX_HTTP_HEADER_BYTES)
header_bytes = source.read_range(0, fetch_size)
# Server returned the same number of bytes as last time: we have
# hit EOF on the underlying file. No point growing further; if
# the IFD chain still doesn't resolve, the file is truncated.
if len(header_bytes) == last_len:
try:
ifds = parse_all_ifds(header_bytes, header)
except ValueError:
ifds = []
break
last_len = len(header_bytes)

if len(ifds) == 0:
raise ValueError("No IFDs found in COG")
259 changes: 259 additions & 0 deletions xrspatial/geotiff/tests/test_http_meta_buffer_1718.py
@@ -0,0 +1,259 @@
"""Tests for HTTP COG metadata prefetch growing past 64 KiB (issue #1718).

The fast path is a single 16 KiB GET; if the IFD chain or its out-of-line
tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA) extend past that
window the prefetch buffer doubles until everything fits or it hits
:data:`MAX_HTTP_HEADER_BYTES`.
"""
from __future__ import annotations

import http.server
import socketserver
import threading

import numpy as np
import pytest

from xrspatial.geotiff._header import parse_all_ifds, parse_header
from xrspatial.geotiff._reader import (
INITIAL_HTTP_HEADER_BYTES,
MAX_HTTP_HEADER_BYTES,
_HTTPSource,
_parse_cog_http_meta,
_read_cog_http,
)
from xrspatial.geotiff._writer import write


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

class _InMemoryHTTPSource(_HTTPSource):
"""_HTTPSource backed by an in-memory bytes buffer.

Counts ``read_range`` calls so tests can lock in the fast-path
invariant (one GET for headers that fit in 16 KiB).
"""

def __init__(self, payload: bytes):
# Skip super().__init__ -- no network, no SSRF validation needed.
self._url = 'memory://test'
self._size = len(payload)
self._pool = None
self._payload = payload
self.read_range_calls: list[tuple[int, int]] = []

def read_range(self, start: int, length: int) -> bytes:
self.read_range_calls.append((start, length))
return self._payload[start:start + length]


class _RangeHandler1718(http.server.BaseHTTPRequestHandler):
payload: bytes = b''

def do_GET(self): # noqa: N802
rng = self.headers.get('Range')
if rng and rng.startswith('bytes='):
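            # Only 'start-end' and 'start-' forms are parsed; suffix
            # ranges ('bytes=-N') are not needed by these tests.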
spec = rng[len('bytes='):]
start_s, _, end_s = spec.partition('-')
start = int(start_s)
end = int(end_s) if end_s else len(self.payload) - 1
chunk = self.payload[start:end + 1]
self.send_response(206)
self.send_header('Content-Type', 'application/octet-stream')
self.send_header(
'Content-Range',
f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}',
)
self.send_header('Content-Length', str(len(chunk)))
self.end_headers()
self.wfile.write(chunk)
return
self.send_response(200)
self.send_header('Content-Type', 'application/octet-stream')
self.send_header('Content-Length', str(len(self.payload)))
self.end_headers()
self.wfile.write(self.payload)

def log_message(self, *_args, **_kwargs):
pass


def _serve(payload: bytes):
handler_cls = type(
'RangeHandler1718Bound', (_RangeHandler1718,), {'payload': payload}
)
httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls)
thread = threading.Thread(target=httpd.serve_forever, daemon=True)
thread.start()
return httpd, thread


def _write_cog_with_big_metadata(path: str, arr: np.ndarray,
metadata_pad_bytes: int) -> None:
"""Write a multi-overview COG whose level-0 IFD carries a huge
GDAL_METADATA tag, pushing the chained overview IFDs past 64 KiB."""
# GDAL_METADATA is stored as an out-of-line ASCII tag value when
# large; a multi-kilobyte payload pads the value area between the
# first IFD and its overviews, forcing the rest of the chain past
# the 16 KiB / 64 KiB prefetch windows.
big_xml = (
'<GDALMetadata>'
+ '<Item name="filler">' + 'x' * metadata_pad_bytes + '</Item>'
+ '</GDALMetadata>'
)
write(arr, path, compression='deflate', tiled=True, tile_size=64,
cog=True, overview_levels=[2, 4, 8],
gdal_metadata_xml=big_xml)
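
# Size arithmetic behind the 96 KiB pad the tests use: with the first
# IFD near the file start, a 96 KiB filler item pushes the chained
# overview IFDs past both the 16 KiB initial fetch and the legacy
# 64 KiB retry, so the grow loop has to reach a 128 KiB buffer
# (16 -> 32 -> 64 -> 128) before the chain resolves.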


# ---------------------------------------------------------------------------
# Fast path: small COG should fire a single 16 KiB read
# ---------------------------------------------------------------------------

def test_small_cog_uses_single_initial_read(tmp_path):
arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
path = str(tmp_path / 'small_1718_cog.tif')
write(arr, path, compression='deflate', tiled=True, tile_size=32,
cog=True, overview_levels=[1])

with open(path, 'rb') as f:
payload = f.read()

src = _InMemoryHTTPSource(payload)
header, ifd, geo_info, header_bytes = _parse_cog_http_meta(src)

# Fast path is exactly one read_range at the initial size.
assert len(src.read_range_calls) == 1
assert src.read_range_calls[0] == (0, INITIAL_HTTP_HEADER_BYTES)
# And the buffer fully resolves the chain.
parsed_ifds = parse_all_ifds(header_bytes, header)
assert parsed_ifds[-1].next_ifd_offset == 0


# ---------------------------------------------------------------------------
# Grow path: COG whose IFD chain extends past 64 KiB still parses
# ---------------------------------------------------------------------------

def test_ifd_chain_past_64kib_resolves(tmp_path):
arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256)
path = str(tmp_path / 'big_meta_1718_cog.tif')
# 96 KiB of XML padding guarantees subsequent IFDs land well past
# both the 16 KiB initial fetch and the legacy 64 KiB retry.
_write_cog_with_big_metadata(path, arr, metadata_pad_bytes=96 * 1024)

with open(path, 'rb') as f:
payload = f.read()

# Sanity: the second IFD's offset really does sit past 64 KiB,
# otherwise this test is not exercising the grow loop.
header = parse_header(payload)
full_ifds = parse_all_ifds(payload, header)
assert len(full_ifds) >= 2, "fixture must have >=2 IFDs"
assert full_ifds[0].next_ifd_offset > 65536, (
"fixture must place IFD #2 past 64 KiB to exercise the grow loop; "
f"got next_ifd_offset={full_ifds[0].next_ifd_offset}"
)

src = _InMemoryHTTPSource(payload)
_, _, _, header_bytes = _parse_cog_http_meta(src)

grown_ifds = parse_all_ifds(header_bytes, header)
assert len(grown_ifds) == len(full_ifds), (
f"prefetch buffer lost IFDs: got {len(grown_ifds)} of {len(full_ifds)}"
)
# Multiple read_range calls confirm the buffer actually grew.
assert len(src.read_range_calls) > 1
# And it did not blow past the cap.
assert src.read_range_calls[-1][1] <= MAX_HTTP_HEADER_BYTES


def test_end_to_end_http_read_with_big_metadata(tmp_path, monkeypatch):
"""_read_cog_http should match local read on a >64 KiB IFD-chain COG."""
monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')
arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256)
path = str(tmp_path / 'http_big_1718_cog.tif')
_write_cog_with_big_metadata(path, arr, metadata_pad_bytes=96 * 1024)

with open(path, 'rb') as f:
payload = f.read()

httpd, _thread = _serve(payload)
port = httpd.server_address[1]
try:
url = f'http://127.0.0.1:{port}/cog.tif'
result, _geo = _read_cog_http(url)
np.testing.assert_array_equal(result, arr)

# Overview read on the same URL must also succeed.
result_ov, _ = _read_cog_http(url, overview_level=1)
assert result_ov.shape[0] < arr.shape[0]
finally:
httpd.shutdown()
httpd.server_close()


# ---------------------------------------------------------------------------
# Truncation / cap behaviour
# ---------------------------------------------------------------------------

def test_cap_raises_clear_error_on_excessive_chain(tmp_path, monkeypatch):
"""When the IFD chain refuses to fit, hitting the cap raises ValueError.

Patches MAX_HTTP_HEADER_BYTES tiny so the test does not need to
fabricate a multi-megabyte payload to exercise the cap branch.
"""
from xrspatial.geotiff import _reader

# Build a payload whose first IFD's next-IFD offset deliberately
# points to a huge address we will never reach. parse_all_ifds will
# return the first IFD but tail_next > buffer, forcing the grow loop.
# The payload itself is small so the server EOF branch is not what
# raises -- we want the cap branch.
arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
    # Write a small fixture COG to disk, then corrupt its chain below.
    path = str(tmp_path / 'cap_1718.tif')
    write(arr, path, compression='deflate', tiled=True, tile_size=16,
          cog=True, overview_levels=[1])
    with open(path, 'rb') as f:
        payload = bytearray(f.read())

header = parse_header(bytes(payload))
ifds = parse_all_ifds(bytes(payload), header)
assert len(ifds) >= 2

# Locate the first IFD's next_ifd_offset slot and rewrite it to a
# far-off value that no buffer growth will ever satisfy.
bo = header.byte_order
first_ifd_off = header.first_ifd_offset
import struct as _struct
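    # Classic (non-Big) TIFF IFD layout: a uint16 entry count, then
    # num_entries 12-byte entries, then a uint32 next-IFD offset; that
    # trailing dword is the slot rewritten below.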
num_entries = _struct.unpack_from(f'{bo}H', payload, first_ifd_off)[0]
next_off_pos = first_ifd_off + 2 + num_entries * 12
    # 10**12 masked to 32 bits below is still ~3.3 GB -- far past any
    # buffer the capped grow loop will fetch.
    far = 10**12
    _struct.pack_into(f'{bo}I', payload, next_off_pos, far & 0xFFFFFFFF)

# Shrink the cap so the test is fast.
monkeypatch.setattr(_reader, 'MAX_HTTP_HEADER_BYTES', 64 * 1024)

src = _InMemoryHTTPSource(bytes(payload))
    # Wrap read_range so requests past EOF still return the requested
    # length (mimics a server that keeps answering with padded data).
    # Without this the ``len(header_bytes) == last_len`` EOF branch
    # short-circuits before the cap branch fires.
real_read = src.read_range

def padded_read(start, length):
data = real_read(start, length)
if len(data) < length:
# Pretend the file is longer than it is by zero-padding,
# so the grow loop keeps growing until it hits the cap.
data = data + b'\x00' * (length - len(data))
return data

src.read_range = padded_read # type: ignore[assignment]

with pytest.raises(ValueError, match='MAX_HTTP_HEADER_BYTES'):
_parse_cog_http_meta(src)