Skip to content

Commit d19ecbf

Browse files
committed
feat: Add IPv6 zone ID support in HTTPAdapter
- Add detection and parsing of IPv6 zone IDs in HTTPAdapter - Correctly handle URL scheme for IPv6 zone IDs - Update regex patterns for IPv6 zone ID handling - Remove unnecessary blank line in _urllib3_request_context Made-with: Cursor
1 parent cbce031 commit d19ecbf

3 files changed

Lines changed: 533 additions & 5 deletions

File tree

src/requests/adapters.py

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"""
88

99
import os.path
10+
import re
1011
import socket # noqa: F401
1112
import typing
1213
import warnings
@@ -73,6 +74,45 @@ def SOCKSProxyManager(*args, **kwargs):
7374
DEFAULT_RETRIES = 0
7475
DEFAULT_POOL_TIMEOUT = None
7576

77+
# Anchored to the authority section of the URL (between "://" and the first
78+
# "/", "?", or "#") so that brackets in the path or query string cannot
79+
# produce false positives.
80+
#
81+
# Inside the brackets two forms are detected:
82+
# - RFC 6874 encoded %25: the delimiter is %25 followed by one or more
83+
# ZoneID characters. Per RFC 6874 the ZoneID unreserved chars are
84+
# [A-Za-z0-9_.\-~] plus percent-encoded octets (%[0-9A-Fa-f]{2}), so
85+
# names like "Ethernet%203" (space encoded as %20) or names containing
86+
# tildes are matched correctly.
87+
# - Literal %: a negative lookahead (?![0-9A-Fa-f]{2}) rejects valid
88+
# percent-encoded bytes whose first hex digit happens to be a letter
89+
# (e.g. %AB, %aF, %CD). After that guard, one alphanumeric character
90+
# is required (covering both named interfaces like eth0 and numeric
91+
# zone indices like 1 or 3), followed by zero or more identifier chars.
92+
_IPV6_ZONE_ID_RE = re.compile(
    # Pin the match to the start of the URL and consume the scheme first, so
    # that the "://" below is always the scheme separator and never a "://"
    # that merely appears later in the path, query string, or fragment
    # (e.g. "http://example.com/?next=http://[fe80::1%eth0]/").
    r"^[^:/?#]*"
    # Authority section up to the opening bracket, then the address part.
    r"://[^/?#]*\[[^\]]*"
    # RFC 6874 form: "%25" followed by one or more unreserved characters or
    # percent-encoded octets.
    r"(?:%25(?:[a-zA-Z0-9_.\-~]|%[0-9A-Fa-f]{2})+"
    # Raw form: a "%" that does not start a %XX escape, followed by one
    # alphanumeric character and then zero or more identifier characters.
    r"|%(?![0-9A-Fa-f]{2})[0-9A-Za-z][A-Za-z0-9_.\-]*)\]"
)
97+
98+
99+
def _has_ipv6_zone_id(url: str) -> bool:
    """
    Report whether ``url`` carries an IPv6 zone identifier (scope ID).

    A zone ID is introduced by a ``%`` character inside the bracketed
    IPv6 literal, for example::

        http://[fe80::1%eth0]:8080/

    Callers use the answer to choose between urllib3's parse_url()
    (which handles zone IDs correctly) and urlparse(), which is kept
    for backward compatibility on every other URL.

    :param url: URL string to inspect
    :return: True when the URL contains an IPv6 zone ID, False otherwise
    :rtype: bool
    """
    zone_match = _IPV6_ZONE_ID_RE.search(url)
    return zone_match is not None
115+
76116

77117
def _urllib3_request_context(
78118
request: "PreparedRequest",
@@ -82,9 +122,21 @@ def _urllib3_request_context(
82122
) -> "(dict[str, typing.Any], dict[str, typing.Any])":
83123
host_params = {}
84124
pool_kwargs = {}
85-
parsed_request_url = urlparse(request.url)
86-
scheme = parsed_request_url.scheme.lower()
87-
port = parsed_request_url.port
125+
126+
# Use urllib3's parse_url for IPv6 zone IDs, urlparse otherwise
127+
if _has_ipv6_zone_id(request.url):
128+
parsed_request_url = parse_url(request.url)
129+
scheme = parsed_request_url.scheme.lower()
130+
port = parsed_request_url.port
131+
# parse_url uses .host and includes brackets for IPv6, strip them
132+
hostname = parsed_request_url.host
133+
if hostname and hostname.startswith("[") and hostname.endswith("]"):
134+
hostname = hostname[1:-1]
135+
else:
136+
parsed_request_url = urlparse(request.url)
137+
scheme = parsed_request_url.scheme.lower()
138+
port = parsed_request_url.port
139+
hostname = parsed_request_url.hostname # urlparse uses .hostname
88140

89141
cert_reqs = "CERT_REQUIRED"
90142
if verify is False:
@@ -105,7 +157,7 @@ def _urllib3_request_context(
105157
pool_kwargs["cert_file"] = client_cert
106158
host_params = {
107159
"scheme": scheme,
108-
"host": parsed_request_url.hostname,
160+
"host": hostname,
109161
"port": port,
110162
}
111163
return host_params, pool_kwargs
@@ -536,7 +588,10 @@ def request_url(self, request, proxies):
536588
:rtype: str
537589
"""
538590
proxy = select_proxy(request.url, proxies)
539-
scheme = urlparse(request.url).scheme
591+
if _has_ipv6_zone_id(request.url):
592+
scheme = parse_url(request.url).scheme
593+
else:
594+
scheme = urlparse(request.url).scheme
540595

541596
is_proxied_http_request = proxy and scheme != "https"
542597
using_socks_proxy = False

src/requests/models.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# Implicit import within threads may cause LookupError when standard library is in a ZIP,
1212
# such as in Embedded Python. See https://github.com/psf/requests/issues/3578.
1313
import encodings.idna # noqa: F401
14+
import re
1415
from io import UnsupportedOperation
1516

1617
from urllib3.exceptions import (
@@ -82,6 +83,14 @@
8283
CONTENT_CHUNK_SIZE = 10 * 1024
8384
ITER_CHUNK_SIZE = 512
8485

86+
# Regex patterns for IPv6 zone ID handling in prepare_url.
87+
# Extracts the bracket content from the authority section of the URL.
88+
_AUTHORITY_BRACKET_RE = re.compile(r"://[^/?#]*\[([^\]]*)\]")
89+
# Matches an RFC 6874 zone ID delimiter (%25) followed by zone ID characters.
90+
_RFC6874_ZONE_ID_RE = re.compile(r"%25(?:[a-zA-Z0-9_.\-~]|%[0-9A-Fa-f]{2})+")
91+
# Matches a raw % zone ID delimiter (a % that does not start a valid %XX percent-encoded byte) together with the zone ID characters that follow it.
92+
_RAW_ZONE_ID_RE = re.compile(r"%(?![0-9A-Fa-f]{2})[0-9A-Za-z][A-Za-z0-9_.\-]*")
93+
8594

8695
class RequestEncodingMixin:
8796
@property
@@ -436,6 +445,41 @@ def prepare_url(self, url, params):
436445
except LocationParseError as e:
437446
raise InvalidURL(*e.args)
438447

448+
# Mitigation for RFC 6874: parse_url incorrectly decodes zone ID delimiter (%25 -> %)
449+
# We reconstruct the host with the correct, fully-encoded delimiter to prevent
450+
# downstream errors (like ipaddress validation or incorrect connection arguments).
451+
#
452+
# Matching on the parse_url-decoded host is ambiguous because parse_url decodes
453+
# %25 -> % and then the resulting %XX may look like a valid percent-encoding
454+
# (e.g. %2550 becomes %50 which resembles percent-encoded 'P'). Instead we
455+
# extract the bracket content from the ORIGINAL url (before any decoding) and
456+
# match there. Two input forms are handled:
457+
#
458+
# 1. RFC 6874 encoded form (%25 delimiter): the original bracket contains %25
459+
# followed by one or more ZoneID unreserved chars ([A-Za-z0-9_.\-~]) or
460+
# pct-encoded octets (%XX). Examples: [fe80::1%25eth0], [fe80::1%255],
461+
# [fe80::1%25_foo]. The matched segment is placed verbatim into host.
462+
#
463+
# 2. Raw % delimiter (legacy/non-standard): a literal % that is NOT a valid
464+
# %XX percent-encoding, followed by one alphanumeric character (letter or digit) and then zero or more identifier chars.
465+
# Examples: [fe80::1%eth0], [fe80::1%wlan0]. Re-encoded as %25<zone_name>.
466+
#
467+
# This avoids false-positive re-encoding of legitimate %XX sequences (e.g. %20,
468+
# %AB) that should never be treated as zone ID delimiters.
469+
if host and host.startswith("[") and host.endswith("]"):
470+
original_bracket = _AUTHORITY_BRACKET_RE.search(url)
471+
if original_bracket:
472+
original_inner = original_bracket.group(1)
473+
rfc_match = _RFC6874_ZONE_ID_RE.search(original_inner)
474+
if rfc_match:
475+
ip_part = original_inner[: rfc_match.start()]
476+
host = f"[{ip_part}{rfc_match.group()}]"
477+
else:
478+
raw_match = _RAW_ZONE_ID_RE.search(original_inner)
479+
if raw_match:
480+
pos = raw_match.start()
481+
host = f"[{original_inner[:pos]}%25{original_inner[pos + 1 :]}]"
482+
439483
if not scheme:
440484
raise MissingSchema(
441485
f"Invalid URL {url!r}: No scheme supplied. "

0 commit comments

Comments
 (0)