Skip to content

Commit cdf2218

Browse files
committed
geotiff: validate HTTP range responses (#1735)
_HTTPSource.read_range previously handed the response body to the caller without checking the status code, Content-Range header, or returned byte length. Three failure modes slipped through silently: - urllib3 by default does not raise on non-2xx; a 4xx/5xx body reached the codec as if it were the requested range. - A server that ignored Range and returned the whole object as a 200 with no Content-Range was trusted for any non-zero start, handing the codec wrong-offset bytes. - A truncated body, or one whose Content-Range header did not match the request, surfaced as an opaque decode error far from the real cause. Add _validate_range_response that checks status, parses Content-Range, and confirms the body length matches what Content-Range advertises. Short reads near EOF stay legitimate: the validator only requires the Content-Range to start at the requested offset and the body length to match what the server itself reported. Applied to both the urllib3 and stdlib urllib code paths. Adds regression coverage in test_http_range_validation_1735.py.
1 parent 1624d13 commit cdf2218

2 files changed

Lines changed: 259 additions & 2 deletions

File tree

xrspatial/geotiff/_reader.py

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -683,15 +683,124 @@ def read_range(self, start: int, length: int) -> bytes:
683683
end = start + length - 1
684684
headers = {'Range': f'bytes={start}-{end}'}
685685
if self._pool is not None:
686-
return self._request(headers=headers).data
686+
resp = self._request(headers=headers)
687+
data = resp.data
688+
self._validate_range_response(
689+
status=resp.status,
690+
content_range=resp.headers.get('Content-Range'),
691+
data=data,
692+
start=start,
693+
length=length,
694+
)
695+
return data
687696
# Fallback: stdlib. urlopen's ``timeout`` is a single value, so
688697
# use the more conservative read timeout; the connect timeout
689698
# isn't separately controllable on stdlib urllib. The opener
690699
# carries ``_ValidatingRedirectHandler`` so 3xx hops are re-
691700
# validated and capped at ``_HTTP_MAX_REDIRECTS``.
692701
req = urllib.request.Request(self._url, headers=headers)
693702
with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp:
694-
return resp.read()
703+
data = resp.read()
704+
# stdlib raises HTTPError for 4xx/5xx automatically; we still
705+
# need to catch the "server ignored Range and returned 200
706+
# with the whole object" case, plus any short body.
707+
self._validate_range_response(
708+
status=getattr(resp, 'status', None) or resp.getcode(),
709+
content_range=resp.headers.get('Content-Range'),
710+
data=data,
711+
start=start,
712+
length=length,
713+
)
714+
return data
715+
716+
@staticmethod
717+
def _validate_range_response(*, status, content_range, data,
718+
start: int, length: int) -> None:
719+
"""Reject HTTP responses that do not satisfy the Range request.
720+
721+
Without this, three things can go wrong silently (issue #1735):
722+
723+
- the server returns a 4xx/5xx body (urllib3 by default does not
724+
raise on non-2xx, so the bytes would be handed to the caller);
725+
- the server ignores ``Range`` for a non-zero start and returns
726+
the whole object as a 200 with no ``Content-Range``, handing
727+
the codec wrong-offset bytes;
728+
- the body is shorter or longer than what the server advertised
729+
via ``Content-Range``, which surfaces later inside a decoder
730+
as an opaque error far from the real cause.
731+
732+
A short response near EOF is legitimate: a fixed-size header
733+
prefetch (e.g. ``read_range(0, 16384)``) will hit the end of a
734+
smaller file and the server returns ``Content-Range: bytes
735+
0-(size-1)/size``. The validator accepts that as long as the
736+
Content-Range starts at the requested offset and the body
737+
length matches what Content-Range advertises.
738+
"""
739+
if status is None or status not in (200, 206):
740+
raise OSError(
741+
f"HTTP range request returned status {status}; expected "
742+
f"206 Partial Content (or 200 with full body)."
743+
)
744+
if content_range is None:
745+
# No Content-Range. A 200 with no Content-Range is only safe
746+
# when the caller asked for the beginning of the object; for
747+
# any other ``start`` the bytes returned do not correspond
748+
# to the requested offset.
749+
if status == 206:
750+
raise OSError(
751+
"HTTP 206 response missing Content-Range header."
752+
)
753+
if start != 0:
754+
raise OSError(
755+
f"HTTP server returned status 200 with no "
756+
f"Content-Range for a range request starting at "
757+
f"byte {start}; refusing to use the body as a "
758+
f"range fetch."
759+
)
760+
if len(data) < min(length, 1):
761+
# Empty body but caller wanted bytes.
762+
raise OSError("HTTP 200 response body is empty.")
763+
return
764+
# Parse ``bytes <start>-<end>/<total-or-*>``. Reject anything
765+
# that does not start at the requested offset; allow ``end`` to
766+
# be lower than requested when EOF was hit.
767+
try:
768+
unit, _, spec = content_range.partition(' ')
769+
rng, _, _total = spec.partition('/')
770+
cr_start_s, _, cr_end_s = rng.partition('-')
771+
cr_start = int(cr_start_s)
772+
cr_end = int(cr_end_s)
773+
except ValueError:
774+
raise OSError(
775+
f"HTTP Content-Range header {content_range!r} could "
776+
f"not be parsed."
777+
) from None
778+
if unit != 'bytes':
779+
raise OSError(
780+
f"HTTP Content-Range unit {unit!r} is not 'bytes'."
781+
)
782+
if cr_start != start:
783+
raise OSError(
784+
f"HTTP Content-Range {content_range!r} starts at byte "
785+
f"{cr_start}, but the request started at byte {start}."
786+
)
787+
if cr_end < cr_start:
788+
raise OSError(
789+
f"HTTP Content-Range {content_range!r} has end "
790+
f"({cr_end}) below start ({cr_start})."
791+
)
792+
if cr_end - cr_start + 1 > length:
793+
raise OSError(
794+
f"HTTP Content-Range {content_range!r} advertises more "
795+
f"bytes than were requested (length={length})."
796+
)
797+
expected_len = cr_end - cr_start + 1
798+
if len(data) != expected_len:
799+
raise OSError(
800+
f"HTTP range body length {len(data)} does not match "
801+
f"the {expected_len} bytes advertised by "
802+
f"Content-Range {content_range!r}."
803+
)
695804

696805
def read_ranges(
697806
self,
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Regression tests for issue #1735.
2+
3+
``_HTTPSource.read_range`` previously returned the response body
4+
without checking the status code, the ``Content-Range`` header, or the
5+
returned byte length. Three failure modes slipped through silently:
6+
7+
- a 200 (Range ignored) or a 4xx/5xx body was handed to the caller as
8+
if it were the requested range,
9+
- a ``Content-Range`` header pointing at a different byte range was
10+
trusted as the requested one,
11+
- a truncated response was passed to a downstream codec where the
12+
decode error appeared far from the real cause.
13+
14+
These tests stand up tiny loopback HTTP servers that misbehave in each
15+
of those ways and assert that ``read_range`` raises a clear ``OSError``.
16+
"""
17+
from __future__ import annotations
18+
19+
import http.server
20+
import socketserver
21+
import threading
22+
23+
import pytest
24+
25+
from xrspatial.geotiff._reader import _HTTPSource
26+
27+
28+
class _BaseHandler(http.server.BaseHTTPRequestHandler):
29+
payload: bytes = b'0' * 64
30+
31+
def log_message(self, *_args, **_kwargs):
32+
pass
33+
34+
35+
def _serve(handler_cls):
36+
httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls)
37+
port = httpd.server_address[1]
38+
thread = threading.Thread(target=httpd.serve_forever, daemon=True)
39+
thread.start()
40+
return f'http://127.0.0.1:{port}/x.bin', httpd, thread
41+
42+
43+
def _stop(httpd):
44+
httpd.shutdown()
45+
httpd.server_close()
46+
47+
48+
@pytest.fixture(autouse=True)
49+
def _allow_loopback(monkeypatch):
50+
monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')
51+
52+
53+
def test_range_request_ignored_for_nonzero_start_raises():
54+
"""Server ignores ``Range`` for a non-zero start and returns full
55+
200 -> OSError. (A 200 with start=0 is harmless because the body
56+
offsets line up with what the caller wanted.)"""
57+
58+
class _Handler(_BaseHandler):
59+
def do_GET(self): # noqa: N802
60+
# Ignore Range header; return the full object as 200.
61+
self.send_response(200)
62+
self.send_header('Content-Length', str(len(self.payload)))
63+
self.end_headers()
64+
self.wfile.write(self.payload)
65+
66+
url, httpd, _ = _serve(_Handler)
67+
try:
68+
src = _HTTPSource(url)
69+
with pytest.raises(OSError, match="Content-Range|range fetch"):
70+
src.read_range(8, 16)
71+
finally:
72+
_stop(httpd)
73+
74+
75+
def test_range_request_wrong_content_range_raises():
76+
"""Server returns 206 but the Content-Range header points at the
77+
wrong bytes -> OSError."""
78+
79+
class _Handler(_BaseHandler):
80+
def do_GET(self): # noqa: N802
81+
# Pretend we sent bytes 4-19/64 regardless of what was asked.
82+
self.send_response(206)
83+
self.send_header('Content-Length', '16')
84+
self.send_header('Content-Range', 'bytes 4-19/64')
85+
self.end_headers()
86+
self.wfile.write(self.payload[4:20])
87+
88+
url, httpd, _ = _serve(_Handler)
89+
try:
90+
src = _HTTPSource(url)
91+
# Caller asks for 0-15; server says 4-19.
92+
with pytest.raises(OSError, match="Content-Range"):
93+
src.read_range(0, 16)
94+
finally:
95+
_stop(httpd)
96+
97+
98+
def test_range_request_short_body_raises():
99+
"""Server returns 206 with a body shorter than the requested
100+
length -> OSError."""
101+
102+
class _Handler(_BaseHandler):
103+
def do_GET(self): # noqa: N802
104+
self.send_response(206)
105+
self.send_header('Content-Length', '4')
106+
self.send_header('Content-Range', 'bytes 0-15/64')
107+
self.end_headers()
108+
# Send only 4 bytes despite advertising a 16-byte range.
109+
self.wfile.write(self.payload[:4])
110+
111+
url, httpd, _ = _serve(_Handler)
112+
try:
113+
src = _HTTPSource(url)
114+
with pytest.raises(OSError, match="length"):
115+
src.read_range(0, 16)
116+
finally:
117+
_stop(httpd)
118+
119+
120+
def test_range_request_well_formed_succeeds():
121+
"""A correctly-formed 206 response is accepted as-is."""
122+
123+
class _Handler(_BaseHandler):
124+
def do_GET(self): # noqa: N802
125+
rng = self.headers.get('Range', '')
126+
spec = rng[len('bytes='):]
127+
s, _, e = spec.partition('-')
128+
start = int(s)
129+
end = int(e)
130+
chunk = self.payload[start:end + 1]
131+
self.send_response(206)
132+
self.send_header('Content-Length', str(len(chunk)))
133+
self.send_header(
134+
'Content-Range',
135+
f'bytes {start}-{start + len(chunk) - 1}/'
136+
f'{len(self.payload)}',
137+
)
138+
self.end_headers()
139+
self.wfile.write(chunk)
140+
141+
url, httpd, _ = _serve(_Handler)
142+
try:
143+
src = _HTTPSource(url)
144+
out = src.read_range(8, 16)
145+
assert out == _BaseHandler.payload[8:24]
146+
assert len(out) == 16
147+
finally:
148+
_stop(httpd)

0 commit comments

Comments
 (0)