Skip to content

Commit a4d410f

Browse files
authored
geotiff: validate HTTP range responses (#1735) (#1744)
1 parent cc77100 commit a4d410f

2 files changed

Lines changed: 346 additions & 2 deletions

File tree

xrspatial/geotiff/_reader.py

Lines changed: 129 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -680,18 +680,145 @@ def _request(self, headers: dict | None = None):
680680
)
681681

682682
def read_range(self, start: int, length: int) -> bytes:
683+
# Match the ``b''``-for-non-positive-length convention used by
684+
# other source implementations (e.g. ``_BytesIOSource``).
685+
# Without this guard, ``Range: bytes=<start>-<start-1>`` goes on
686+
# the wire, which is an invalid range and triggers a 416 from
687+
# well-behaved servers (or worse, an arbitrarily large 200 body
688+
# from misbehaving ones).
689+
if length <= 0:
690+
return b''
683691
end = start + length - 1
684692
headers = {'Range': f'bytes={start}-{end}'}
685693
if self._pool is not None:
686-
return self._request(headers=headers).data
694+
resp = self._request(headers=headers)
695+
data = resp.data
696+
return self._validate_range_response(
697+
status=resp.status,
698+
content_range=resp.headers.get('Content-Range'),
699+
data=data,
700+
start=start,
701+
length=length,
702+
)
687703
# Fallback: stdlib. urlopen's ``timeout`` is a single value, so
688704
# use the more conservative read timeout; the connect timeout
689705
# isn't separately controllable on stdlib urllib. The opener
690706
# carries ``_ValidatingRedirectHandler`` so 3xx hops are re-
691707
# validated and capped at ``_HTTP_MAX_REDIRECTS``.
692708
req = urllib.request.Request(self._url, headers=headers)
693709
with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp:
694-
return resp.read()
710+
data = resp.read()
711+
# stdlib raises HTTPError for 4xx/5xx automatically; we still
712+
# need to catch the "server ignored Range and returned 200
713+
# with the whole object" case, plus any short body.
714+
return self._validate_range_response(
715+
status=getattr(resp, 'status', None) or resp.getcode(),
716+
content_range=resp.headers.get('Content-Range'),
717+
data=data,
718+
start=start,
719+
length=length,
720+
)
721+
722+
@staticmethod
723+
def _validate_range_response(*, status, content_range, data,
724+
start: int, length: int) -> bytes:
725+
"""Reject HTTP responses that do not satisfy the Range request.
726+
727+
Without this, three things can go wrong silently (issue #1735):
728+
729+
- the server returns a 4xx/5xx body (urllib3 by default does not
730+
raise on non-2xx, so the bytes would be handed to the caller);
731+
- the server ignores ``Range`` for a non-zero start and returns
732+
the whole object as a 200 with no ``Content-Range``, handing
733+
the codec wrong-offset bytes;
734+
- the body is shorter or longer than what the server advertised
735+
via ``Content-Range``, which surfaces later inside a decoder
736+
as an opaque error far from the real cause.
737+
738+
A short response near EOF is legitimate: a fixed-size header
739+
prefetch (e.g. ``read_range(0, 16384)``) will hit the end of a
740+
smaller file and the server returns ``Content-Range: bytes
741+
0-(size-1)/size``. The validator accepts that as long as the
742+
Content-Range starts at the requested offset and the body
743+
length matches what Content-Range advertises.
744+
745+
Returns the validated bytes, sliced to at most ``length`` bytes
746+
in the "server ignored Range and returned the whole object as
747+
200" case (start == 0, no Content-Range). Other branches return
748+
``data`` unchanged.
749+
"""
750+
if status is None or status not in (200, 206):
751+
raise OSError(
752+
f"HTTP range request returned status {status}; expected "
753+
f"206 Partial Content (or 200 with full body)."
754+
)
755+
if content_range is None:
756+
# No Content-Range. A 200 with no Content-Range is only safe
757+
# when the caller asked for the beginning of the object; for
758+
# any other ``start`` the bytes returned do not correspond
759+
# to the requested offset.
760+
if status == 206:
761+
raise OSError(
762+
"HTTP 206 response missing Content-Range header."
763+
)
764+
if start != 0:
765+
raise OSError(
766+
f"HTTP server returned status 200 with no "
767+
f"Content-Range for a range request starting at "
768+
f"byte {start}; refusing to use the body as a "
769+
f"range fetch."
770+
)
771+
if len(data) < min(length, 1):
772+
# Empty body but caller wanted bytes.
773+
raise OSError("HTTP 200 response body is empty.")
774+
# Server ignored Range and returned the full object as 200.
775+
# The implicit contract is "at most ``length`` bytes"; slice
776+
# so a 16 KiB prefetch against a 2 GiB object doesn't drag
777+
# the whole thing into memory.
778+
if len(data) > length:
779+
return data[:length]
780+
return data
781+
# Parse ``bytes <start>-<end>/<total-or-*>``. Reject anything
782+
# that does not start at the requested offset; allow ``end`` to
783+
# be lower than requested when EOF was hit.
784+
try:
785+
unit, _, spec = content_range.partition(' ')
786+
rng, _, _total = spec.partition('/')
787+
cr_start_s, _, cr_end_s = rng.partition('-')
788+
cr_start = int(cr_start_s)
789+
cr_end = int(cr_end_s)
790+
except ValueError:
791+
raise OSError(
792+
f"HTTP Content-Range header {content_range!r} could "
793+
f"not be parsed."
794+
) from None
795+
if unit != 'bytes':
796+
raise OSError(
797+
f"HTTP Content-Range unit {unit!r} is not 'bytes'."
798+
)
799+
if cr_start != start:
800+
raise OSError(
801+
f"HTTP Content-Range {content_range!r} starts at byte "
802+
f"{cr_start}, but the request started at byte {start}."
803+
)
804+
if cr_end < cr_start:
805+
raise OSError(
806+
f"HTTP Content-Range {content_range!r} has end "
807+
f"({cr_end}) below start ({cr_start})."
808+
)
809+
if cr_end - cr_start + 1 > length:
810+
raise OSError(
811+
f"HTTP Content-Range {content_range!r} advertises more "
812+
f"bytes than were requested (length={length})."
813+
)
814+
expected_len = cr_end - cr_start + 1
815+
if len(data) != expected_len:
816+
raise OSError(
817+
f"HTTP range body length {len(data)} does not match "
818+
f"the {expected_len} bytes advertised by "
819+
f"Content-Range {content_range!r}."
820+
)
821+
return data
695822

696823
def read_ranges(
697824
self,
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
"""Regression tests for issue #1735.
2+
3+
``_HTTPSource.read_range`` previously returned the response body
4+
without checking the status code, the ``Content-Range`` header, or the
5+
returned byte length. Three failure modes slipped through silently:
6+
7+
- a 200 (Range ignored) or a 4xx/5xx body was handed to the caller as
8+
if it were the requested range,
9+
- a ``Content-Range`` header pointing at a different byte range was
10+
trusted as the requested one,
11+
- a truncated response was passed to a downstream codec where the
12+
decode error appeared far from the real cause.
13+
14+
These tests stand up tiny loopback HTTP servers that misbehave in each
15+
of those ways and assert that ``read_range`` raises a clear ``OSError``.
16+
"""
17+
from __future__ import annotations
18+
19+
import http.server
20+
import socketserver
21+
import threading
22+
23+
import pytest
24+
25+
from xrspatial.geotiff._reader import _HTTPSource
26+
27+
28+
class _BaseHandler(http.server.BaseHTTPRequestHandler):
29+
payload: bytes = b'0' * 64
30+
31+
def log_message(self, *_args, **_kwargs):
32+
pass
33+
34+
35+
def _serve(handler_cls):
36+
httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls)
37+
port = httpd.server_address[1]
38+
thread = threading.Thread(target=httpd.serve_forever, daemon=True)
39+
thread.start()
40+
return f'http://127.0.0.1:{port}/x.bin', httpd, thread
41+
42+
43+
def _stop(httpd):
44+
httpd.shutdown()
45+
httpd.server_close()
46+
47+
48+
@pytest.fixture(autouse=True)
49+
def _allow_loopback(monkeypatch):
50+
monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')
51+
52+
53+
def test_range_request_ignored_for_nonzero_start_raises():
54+
"""Server ignores ``Range`` for a non-zero start and returns full
55+
200 -> OSError. (A 200 with start=0 is harmless because the body
56+
offsets line up with what the caller wanted.)"""
57+
58+
class _Handler(_BaseHandler):
59+
def do_GET(self): # noqa: N802
60+
# Ignore Range header; return the full object as 200.
61+
self.send_response(200)
62+
self.send_header('Content-Length', str(len(self.payload)))
63+
self.end_headers()
64+
self.wfile.write(self.payload)
65+
66+
url, httpd, _ = _serve(_Handler)
67+
try:
68+
src = _HTTPSource(url)
69+
with pytest.raises(OSError, match="Content-Range|range fetch"):
70+
src.read_range(8, 16)
71+
finally:
72+
_stop(httpd)
73+
74+
75+
def test_range_request_wrong_content_range_raises():
76+
"""Server returns 206 but the Content-Range header points at the
77+
wrong bytes -> OSError."""
78+
79+
class _Handler(_BaseHandler):
80+
def do_GET(self): # noqa: N802
81+
# Pretend we sent bytes 4-19/64 regardless of what was asked.
82+
self.send_response(206)
83+
self.send_header('Content-Length', '16')
84+
self.send_header('Content-Range', 'bytes 4-19/64')
85+
self.end_headers()
86+
self.wfile.write(self.payload[4:20])
87+
88+
url, httpd, _ = _serve(_Handler)
89+
try:
90+
src = _HTTPSource(url)
91+
# Caller asks for 0-15; server says 4-19.
92+
with pytest.raises(OSError, match="Content-Range"):
93+
src.read_range(0, 16)
94+
finally:
95+
_stop(httpd)
96+
97+
98+
def test_range_request_short_body_raises():
99+
"""Server returns 206 with a body shorter than the requested
100+
length -> OSError."""
101+
102+
class _Handler(_BaseHandler):
103+
def do_GET(self): # noqa: N802
104+
self.send_response(206)
105+
self.send_header('Content-Length', '4')
106+
self.send_header('Content-Range', 'bytes 0-15/64')
107+
self.end_headers()
108+
# Send only 4 bytes despite advertising a 16-byte range.
109+
self.wfile.write(self.payload[:4])
110+
111+
url, httpd, _ = _serve(_Handler)
112+
try:
113+
src = _HTTPSource(url)
114+
with pytest.raises(OSError, match="length"):
115+
src.read_range(0, 16)
116+
finally:
117+
_stop(httpd)
118+
119+
120+
def test_range_request_well_formed_succeeds():
121+
"""A correctly-formed 206 response is accepted as-is."""
122+
123+
class _Handler(_BaseHandler):
124+
def do_GET(self): # noqa: N802
125+
rng = self.headers.get('Range', '')
126+
spec = rng[len('bytes='):]
127+
s, _, e = spec.partition('-')
128+
start = int(s)
129+
end = int(e)
130+
chunk = self.payload[start:end + 1]
131+
self.send_response(206)
132+
self.send_header('Content-Length', str(len(chunk)))
133+
self.send_header(
134+
'Content-Range',
135+
f'bytes {start}-{start + len(chunk) - 1}/'
136+
f'{len(self.payload)}',
137+
)
138+
self.end_headers()
139+
self.wfile.write(chunk)
140+
141+
url, httpd, _ = _serve(_Handler)
142+
try:
143+
src = _HTTPSource(url)
144+
out = src.read_range(8, 16)
145+
assert out == _BaseHandler.payload[8:24]
146+
assert len(out) == 16
147+
finally:
148+
_stop(httpd)
149+
150+
151+
def test_read_range_zero_length_returns_empty_without_request():
152+
"""``read_range(start, 0)`` (and negative ``length``) must short-
153+
circuit to ``b''`` before any HTTP request goes on the wire.
154+
155+
Without the guard, ``Range: bytes=<start>-<start-1>`` is sent, which
156+
is an invalid range and either trips a 416 from a well-behaved
157+
server or pulls down arbitrarily large bytes from a misbehaving one.
158+
Other source implementations (e.g. ``_BytesIOSource``) already
159+
follow the ``b''``-on-non-positive-length convention; this test
160+
pins that contract for ``_HTTPSource`` too.
161+
"""
162+
hit_count = {'n': 0}
163+
164+
class _Handler(_BaseHandler):
165+
def do_GET(self): # noqa: N802
166+
# If we ever land here, the guard failed. Record the hit so
167+
# the assertion below points at the right cause.
168+
hit_count['n'] += 1
169+
self.send_response(200)
170+
self.send_header('Content-Length', str(len(self.payload)))
171+
self.end_headers()
172+
self.wfile.write(self.payload)
173+
174+
url, httpd, _ = _serve(_Handler)
175+
try:
176+
src = _HTTPSource(url)
177+
assert src.read_range(10, 0) == b''
178+
assert src.read_range(0, 0) == b''
179+
assert src.read_range(10, -5) == b''
180+
assert hit_count['n'] == 0, (
181+
"read_range(length<=0) should not issue an HTTP request"
182+
)
183+
finally:
184+
_stop(httpd)
185+
186+
187+
def test_range_ignored_200_with_full_body_is_sliced_to_length():
188+
"""Server ignores ``Range`` for ``start=0`` and returns the full
189+
object as a 200 with no ``Content-Range`` -> response is sliced to
190+
the requested length.
191+
192+
The implicit contract of ``read_range`` is "at most ``length``
193+
bytes". A 16 KiB header prefetch against a 2 GiB object must not
194+
drag the whole thing into memory when the server misbehaves.
195+
"""
196+
197+
class _Handler(_BaseHandler):
198+
# Payload larger than the requested length so we can verify the
199+
# slice happens.
200+
payload = b'\xab' * 4096
201+
202+
def do_GET(self): # noqa: N802
203+
# Ignore Range entirely; return the full object as 200.
204+
self.send_response(200)
205+
self.send_header('Content-Length', str(len(self.payload)))
206+
self.end_headers()
207+
self.wfile.write(self.payload)
208+
209+
url, httpd, _ = _serve(_Handler)
210+
try:
211+
src = _HTTPSource(url)
212+
# start=0 -> validator should slice rather than raise.
213+
out = src.read_range(0, 64)
214+
assert len(out) == 64
215+
assert out == _Handler.payload[:64]
216+
finally:
217+
_stop(httpd)

0 commit comments

Comments
 (0)