diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index 0c2071b2..9ee4b30e 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -1518,33 +1518,115 @@ def _decode_one(job):
 # COG HTTP reader
 # ---------------------------------------------------------------------------
 
+#: Initial prefetch size for ``_parse_cog_http_meta``. Sized for the common
+#: case (a single-IFD COG with modest GeoTIFF tags) so the fast path is a
+#: single range GET.
+INITIAL_HTTP_HEADER_BYTES = 16 * 1024
+
+#: Upper bound on how far ``_parse_cog_http_meta`` will grow its prefetch
+#: buffer before giving up. 4 MiB comfortably covers deep pyramids whose
+#: IFD chains plus tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA)
+#: extend far past the initial fetch window. See issue #1718.
+MAX_HTTP_HEADER_BYTES = 4 * 1024 * 1024
+
+
+def _ifd_required_extent(
+    ifds: list[IFD], header: TIFFHeader, data_len: int,
+) -> int:
+    """Return the byte extent needed to continue parsing the IFD chain.
+
+    Returns 0 when the chain is complete. Used to decide whether the
+    prefetch buffer is large enough to hold the entire IFD chain plus
+    every out-of-line tag value: :func:`_parse_cog_http_meta` compares
+    the result against ``len(header_bytes)`` and, if it exceeds the
+    buffer, grows the buffer and retries.
+
+    No per-tag walk happens here. For out-of-line tags ``parse_ifd``
+    already resolved the pointer and validated ``ptr + size <= data_len``,
+    so an overrun surfaces as ``ValueError`` (caught by the grow loop)
+    rather than as a large extent. The only extent left to report is the
+    chain tail's next-IFD pointer, padded so the next parse attempt can
+    at least read that IFD's entry count.
+    """
+    if not ifds:
+        return 0
+
+    required = 0
+    # The last IFD's next_ifd_offset: 0 means end-of-chain; anything else
+    # points at an IFD we have not parsed yet because it sat past the
+    # buffer (parse_all_ifds stops on offset >= len(data)).
+    tail_next = ifds[-1].next_ifd_offset
+    if tail_next != 0:
+        # Need at least enough bytes to reach the next IFD header. Pad
+        # by a small amount so parse_ifd can read the num_entries field
+        # without truncation -- the actual entry table is bounded by the
+        # parser's own checks on the next grow iteration.
+        required = max(required, tail_next + 64)
+
+    # Out-of-line tag values are already parsed (parse_ifd bounds-checked
+    # ptr + total_size <= len(data) before reading). For grow logic we
+    # only need to ensure those checks did not *fail*; a raised
+    # ValueError surfaces in parse_all_ifds and is handled by the loop.
+    return required
+
+
 def _parse_cog_http_meta(
     source: _HTTPSource, overview_level: int | None = None,
 ) -> tuple[TIFFHeader, IFD, GeoInfo, bytes]:
     """Fetch + parse the leading IFDs of an HTTP COG once.
 
-    Issues one (or rarely two) range GET(s) for the leading 16 KB / 64 KB
-    of the file, parses the header and IFD list, and returns the selected
-    IFD plus the raw header bytes (kept for ``extract_geo_info`` callers
-    that might want the IFD's tag offsets).
+    The fast path is a single 16 KiB range GET. When the IFD chain or its
+    out-of-line tag arrays extend past that window, the buffer is doubled
+    and re-parsed until either the chain is fully resolved or the cap at
+    :data:`MAX_HTTP_HEADER_BYTES` is reached. Real COGs whose pyramid
+    metadata legitimately exceeds the cap need a different strategy
+    (lazy per-IFD reads); the cap exists to bound a malformed-file blast
+    radius rather than to constrain valid pyramids.
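+
+    A worked example of the schedule, assuming the defaults above: the
+    buffer grows 16 KiB -> 32 KiB -> 64 KiB -> ... -> 4 MiB, so even a
+    worst-case parse issues at most eight follow-up range GETs after the
+    initial one before the cap check raises.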
 
     Pulled out of :func:`_read_cog_http` so :func:`read_geotiff_dask` can
     parse metadata once per graph rather than once per chunk task (P5:
     each delayed task used to fire its own 16 KB header GET).
     """
-    header_bytes = source.read_range(0, 16384)
+    fetch_size = INITIAL_HTTP_HEADER_BYTES
+    header_bytes = source.read_range(0, fetch_size)
     header = parse_header(header_bytes)
-    ifds = parse_all_ifds(header_bytes, header)
-    # parse_all_ifds bails the moment it walks past the bytes we
-    # fetched, so a header GET that lands short of the first IFD's
-    # offset returns an empty list. Retry with a larger window in that
-    # case; this is *not* a partial-IFD recovery (overviews chained
-    # past the first 16 KiB are still loaded lazily by other readers).
-    if len(ifds) == 0:
-        header_bytes = source.read_range(0, 65536)
-        ifds = parse_all_ifds(header_bytes, header)
+    last_len = len(header_bytes)
+    ifds: list[IFD] = []
+    while True:
+        try:
+            ifds = parse_all_ifds(header_bytes, header)
+            required = _ifd_required_extent(ifds, header, len(header_bytes))
+            # Chain is fully resolved when every IFD parsed cleanly and
+            # the tail next_ifd_offset is reachable within the buffer
+            # (required == 0 means end-of-chain).
+            if ifds and required <= len(header_bytes):
+                break
+        except ValueError:
+            # parse_ifd raises when an out-of-line tag points past the
+            # buffer. Treat it the same as a truncated chain: grow and
+            # retry. If we are already at the cap and still failing, the
+            # cap check just below raises a clear error.
+            ifds = []
+
+        if fetch_size >= MAX_HTTP_HEADER_BYTES:
+            raise ValueError(
+                "COG IFD chain or tag arrays extend past "
+                f"MAX_HTTP_HEADER_BYTES={MAX_HTTP_HEADER_BYTES} bytes; "
+                "the file may be malformed or its pyramid metadata is "
+                "unusually large for HTTP prefetch")
+        fetch_size = min(fetch_size * 2, MAX_HTTP_HEADER_BYTES)
+        header_bytes = source.read_range(0, fetch_size)
+        # Server returned the same number of bytes as last time: we have
+        # hit EOF on the underlying file. No point growing further; if
+        # the IFD chain still does not resolve, the file is truncated.
+        if len(header_bytes) == last_len:
+            try:
+                ifds = parse_all_ifds(header_bytes, header)
+            except ValueError:
+                ifds = []
+            break
+        last_len = len(header_bytes)
 
     if len(ifds) == 0:
         raise ValueError("No IFDs found in COG")
diff --git a/xrspatial/geotiff/tests/test_http_meta_buffer_1718.py b/xrspatial/geotiff/tests/test_http_meta_buffer_1718.py
new file mode 100644
index 00000000..928d89a9
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_http_meta_buffer_1718.py
@@ -0,0 +1,259 @@
+"""Tests for HTTP COG metadata prefetch growing past 64 KiB (issue #1718).
+
+The fast path is a single 16 KiB GET; if the IFD chain or its out-of-line
+tag arrays (TileOffsets, GeoAsciiParams, GDAL_METADATA) extend past that
+window the prefetch buffer doubles until everything fits or it hits
+:data:`MAX_HTTP_HEADER_BYTES`.
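+
+The fixtures pad the level-0 IFD with roughly 96 KiB of GDAL_METADATA
+XML so the chained overview IFDs land past both the 16 KiB initial
+fetch and the legacy 64 KiB retry window; the exact pad size is
+illustrative -- anything comfortably past 64 KiB exercises the grow
+loop.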
+"""
+from __future__ import annotations
+
+import http.server
+import socketserver
+import threading
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff._header import parse_all_ifds, parse_header
+from xrspatial.geotiff._reader import (
+    INITIAL_HTTP_HEADER_BYTES,
+    MAX_HTTP_HEADER_BYTES,
+    _HTTPSource,
+    _parse_cog_http_meta,
+    _read_cog_http,
+)
+from xrspatial.geotiff._writer import write
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+class _InMemoryHTTPSource(_HTTPSource):
+    """_HTTPSource backed by an in-memory bytes buffer.
+
+    Counts ``read_range`` calls so tests can lock in the fast-path
+    invariant (one GET for headers that fit in 16 KiB).
+    """
+
+    def __init__(self, payload: bytes):
+        # Skip super().__init__ -- no network, no SSRF validation needed.
+        self._url = 'memory://test'
+        self._size = len(payload)
+        self._pool = None
+        self._payload = payload
+        self.read_range_calls: list[tuple[int, int]] = []
+
+    def read_range(self, start: int, length: int) -> bytes:
+        self.read_range_calls.append((start, length))
+        return self._payload[start:start + length]
+
+
+class _RangeHandler1718(http.server.BaseHTTPRequestHandler):
+    payload: bytes = b''
+
+    def do_GET(self):  # noqa: N802
+        rng = self.headers.get('Range')
+        if rng and rng.startswith('bytes='):
+            spec = rng[len('bytes='):]
+            start_s, _, end_s = spec.partition('-')
+            start = int(start_s)
+            end = int(end_s) if end_s else len(self.payload) - 1
+            chunk = self.payload[start:end + 1]
+            self.send_response(206)
+            self.send_header('Content-Type', 'application/octet-stream')
+            self.send_header(
+                'Content-Range',
+                f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}',
+            )
+            self.send_header('Content-Length', str(len(chunk)))
+            self.end_headers()
+            self.wfile.write(chunk)
+            return
+        self.send_response(200)
+        self.send_header('Content-Type', 'application/octet-stream')
+        self.send_header('Content-Length', str(len(self.payload)))
+        self.end_headers()
+        self.wfile.write(self.payload)
+
+    def log_message(self, *_args, **_kwargs):
+        pass
+
+
+def _serve(payload: bytes):
+    handler_cls = type(
+        'RangeHandler1718Bound', (_RangeHandler1718,), {'payload': payload}
+    )
+    httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls)
+    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
+    thread.start()
+    return httpd, thread
+
+
+def _write_cog_with_big_metadata(path: str, arr: np.ndarray,
+                                 metadata_pad_bytes: int) -> None:
+    """Write a multi-overview COG whose level-0 IFD carries a huge
+    GDAL_METADATA tag, pushing the chained overview IFDs past 64 KiB."""
+    # GDAL_METADATA is stored as an out-of-line ASCII tag value when
+    # large; a multi-kilobyte payload pads the value area between the
+    # first IFD and its overviews, forcing the rest of the chain past
+    # the 16 KiB / 64 KiB prefetch windows.
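+    #
+    # Rough byte layout of the result (offsets illustrative only):
+    #
+    #   [header][IFD 0][~pad bytes of GDAL_METADATA XML][IFD 1]...[IFD n]
+    #
+    # so everything after the first IFD sits past the prefetch window.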
+    # The Item name below is arbitrary test scaffolding; only the size
+    # of the value matters for pushing the IFD chain out.
+    big_xml = (
+        '<GDALMetadata><Item name="pad">'
+        + 'x' * metadata_pad_bytes
+        + '</Item></GDALMetadata>'
+    )
+    write(arr, path, compression='deflate', tiled=True, tile_size=64,
+          cog=True, overview_levels=[2, 4, 8],
+          gdal_metadata_xml=big_xml)
+
+
+# ---------------------------------------------------------------------------
+# Fast path: small COG should fire a single 16 KiB read
+# ---------------------------------------------------------------------------
+
+def test_small_cog_uses_single_initial_read(tmp_path):
+    arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
+    path = str(tmp_path / 'small_1718_cog.tif')
+    write(arr, path, compression='deflate', tiled=True, tile_size=32,
+          cog=True, overview_levels=[1])
+
+    with open(path, 'rb') as f:
+        payload = f.read()
+
+    src = _InMemoryHTTPSource(payload)
+    header, _ifd, _geo_info, header_bytes = _parse_cog_http_meta(src)
+
+    # Fast path is exactly one read_range at the initial size.
+    assert len(src.read_range_calls) == 1
+    assert src.read_range_calls[0] == (0, INITIAL_HTTP_HEADER_BYTES)
+    # And the buffer fully resolves the chain.
+    parsed_ifds = parse_all_ifds(header_bytes, header)
+    assert parsed_ifds[-1].next_ifd_offset == 0
+
+
+# ---------------------------------------------------------------------------
+# Grow path: COG whose IFD chain extends past 64 KiB still parses
+# ---------------------------------------------------------------------------
+
+def test_ifd_chain_past_64kib_resolves(tmp_path):
+    arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256)
+    path = str(tmp_path / 'big_meta_1718_cog.tif')
+    # 96 KiB of XML padding guarantees subsequent IFDs land well past
+    # both the 16 KiB initial fetch and the legacy 64 KiB retry.
+    _write_cog_with_big_metadata(path, arr, metadata_pad_bytes=96 * 1024)
+
+    with open(path, 'rb') as f:
+        payload = f.read()
+
+    # Sanity: the second IFD's offset really does sit past 64 KiB,
+    # otherwise this test is not exercising the grow loop.
+    header = parse_header(payload)
+    full_ifds = parse_all_ifds(payload, header)
+    assert len(full_ifds) >= 2, "fixture must have >=2 IFDs"
+    assert full_ifds[0].next_ifd_offset > 65536, (
+        "fixture must place IFD #2 past 64 KiB to exercise the grow loop; "
+        f"got next_ifd_offset={full_ifds[0].next_ifd_offset}"
+    )
+
+    src = _InMemoryHTTPSource(payload)
+    _, _, _, header_bytes = _parse_cog_http_meta(src)
+
+    grown_ifds = parse_all_ifds(header_bytes, header)
+    assert len(grown_ifds) == len(full_ifds), (
+        f"prefetch buffer lost IFDs: got {len(grown_ifds)} of {len(full_ifds)}"
+    )
+    # Multiple read_range calls confirm the buffer actually grew.
+    assert len(src.read_range_calls) > 1
+    # And it did not blow past the cap.
+    assert src.read_range_calls[-1][1] <= MAX_HTTP_HEADER_BYTES
+
+
+def test_end_to_end_http_read_with_big_metadata(tmp_path, monkeypatch):
+    """_read_cog_http should match local read on a >64 KiB IFD-chain COG."""
+    monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')
+    arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256)
+    path = str(tmp_path / 'http_big_1718_cog.tif')
+    _write_cog_with_big_metadata(path, arr, metadata_pad_bytes=96 * 1024)
+
+    with open(path, 'rb') as f:
+        payload = f.read()
+
+    httpd, _thread = _serve(payload)
+    port = httpd.server_address[1]
+    try:
+        url = f'http://127.0.0.1:{port}/cog.tif'
+        result, _geo = _read_cog_http(url)
+        np.testing.assert_array_equal(result, arr)
+
+        # Overview read on the same URL must also succeed.
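+        # (overview_level=1 should select the first reduced-resolution
+        # IFD, hence the strictly-smaller shape assert below.)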
+        result_ov, _ = _read_cog_http(url, overview_level=1)
+        assert result_ov.shape[0] < arr.shape[0]
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+
+
+# ---------------------------------------------------------------------------
+# Truncation / cap behaviour
+# ---------------------------------------------------------------------------
+
+def test_cap_raises_clear_error_on_excessive_chain(tmp_path, monkeypatch):
+    """When the IFD chain refuses to fit, hitting the cap raises ValueError.
+
+    Patches MAX_HTTP_HEADER_BYTES tiny so the test does not need to
+    fabricate a multi-megabyte payload to exercise the cap branch.
+    """
+    import struct
+
+    from xrspatial.geotiff import _reader
+
+    # Build a payload whose first IFD's next-IFD offset deliberately
+    # points to a huge address we will never reach. parse_all_ifds will
+    # return the first IFD but tail_next > buffer, forcing the grow loop.
+    # The payload itself is small so the server EOF branch is not what
+    # raises -- we want the cap branch.
+    arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
+    path = str(tmp_path / 'cap_1718.tif')
+    write(arr, path, compression='deflate', tiled=True, tile_size=16,
+          cog=True, overview_levels=[1])
+    with open(path, 'rb') as f:
+        payload = bytearray(f.read())
+
+    header = parse_header(bytes(payload))
+    ifds = parse_all_ifds(bytes(payload), header)
+    assert len(ifds) >= 2
+
+    # Locate the first IFD's next_ifd_offset slot and rewrite it to a
+    # far-off value that no buffer growth will ever satisfy.
+    bo = header.byte_order
+    first_ifd_off = header.first_ifd_offset
+    num_entries = struct.unpack_from(f'{bo}H', payload, first_ifd_off)[0]
+    next_off_pos = first_ifd_off + 2 + num_entries * 12
+    far = 0xFFFF0000  # ~4 GiB -- unreachable under the patched cap below
+    struct.pack_into(f'{bo}I', payload, next_off_pos, far)
+
+    # Shrink the cap so the test is fast.
+    monkeypatch.setattr(_reader, 'MAX_HTTP_HEADER_BYTES', 64 * 1024)
+
+    src = _InMemoryHTTPSource(bytes(payload))
+    # Wrap read_range so requests past EOF still return `length` bytes
+    # (mimicking an HTTP server whose file really is that large).
+    # Without this the EOF branch short-circuits before the cap branch
+    # fires.
+    real_read = src.read_range
+
+    def padded_read(start, length):
+        data = real_read(start, length)
+        if len(data) < length:
+            # Pretend the file is longer than it is by zero-padding,
+            # so the grow loop keeps growing until it hits the cap.
+            data = data + b'\x00' * (length - len(data))
+        return data
+
+    src.read_range = padded_read  # type: ignore[assignment]
+
+    with pytest.raises(ValueError, match='MAX_HTTP_HEADER_BYTES'):
+        _parse_cog_http_meta(src)
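+
+
+# NOTE: patching the cap via ``monkeypatch.setattr(_reader, ...)`` takes
+# effect because ``_parse_cog_http_meta`` looks up MAX_HTTP_HEADER_BYTES
+# as a module global at call time; the name imported at the top of this
+# file is a separate binding and keeps the real 4 MiB default.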