feat: Add IPv6 zone ID support in HTTPAdapter

tboy1337 · tboy1337 · commit d19ecbf10ae4 · 2026-04-13T13:28:55.000+07:00
- Add detection and parsing of IPv6 zone IDs in HTTPAdapter
- Correctly handle URL scheme for IPv6 zone IDs
- Update regex patterns for IPv6 zone ID handling
- Remove unnecessary blank line in _urllib3_request_context

Made-with: Cursor
diff --git a/src/requests/adapters.py b/src/requests/adapters.py
@@ -7,6 +7,7 @@
 """
 
 import os.path
+import re
 import socket  # noqa: F401
 import typing
 import warnings
@@ -73,6 +74,45 @@ def SOCKSProxyManager(*args, **kwargs):
 DEFAULT_RETRIES = 0
 DEFAULT_POOL_TIMEOUT = None
 
+# Matches from a "://" through the authority-like run that follows it (up to
+# the first "/", "?", or "#"). NOTE: the pattern has no start-of-URL anchor and
+# is used with search(), so a nested "://[...]" in a path or query also matches.
+#
+# Inside the brackets two forms are detected:
+#   - RFC 6874 encoded %25: the delimiter is %25 followed by one or more
+#     ZoneID characters. Per RFC 6874 the ZoneID unreserved chars are
+#     [A-Za-z0-9_.\-~] plus percent-encoded octets (%[0-9A-Fa-f]{2}), so
+#     names like "Ethernet%203" (space encoded as %20) or names containing
+#     tildes are matched correctly.
+#   - Literal %: a negative lookahead (?![0-9A-Fa-f]{2}) rejects any % that
+#     starts a valid percent-encoded byte (e.g. %20, %AB), so raw zones that
+#     begin with two hex digits (e.g. %10) are not matched. After that guard,
+#     one alphanumeric character is required (eth0, or a single-digit index
+#     like 1 or 3), followed by zero or more identifier chars.
+_IPV6_ZONE_ID_RE = re.compile(
+    r"://[^/?#]*\[[^\]]*"
+    r"(?:%25(?:[a-zA-Z0-9_.\-~]|%[0-9A-Fa-f]{2})+"
+    r"|%(?![0-9A-Fa-f]{2})[0-9A-Za-z][A-Za-z0-9_.\-]*)\]"
+)
+
+
+def _has_ipv6_zone_id(url: str) -> bool:
+    """
+    Detect whether a URL contains an IPv6 zone identifier (scope ID).
+
+    The zone ID follows a "%" (or RFC 6874 "%25") inside the brackets, e.g.:
+    http://[fe80::1%eth0]:8080/ or http://[fe80::1%25eth0]:8080/
+
+    This is used to determine whether to use urllib3's parse_url()
+    (which handles zone IDs correctly) or urlparse() for backward
+    compatibility.
+
+    :param url: URL string to check
+    :return: True if URL contains IPv6 zone ID
+    :rtype: bool
+    """
+    return bool(_IPV6_ZONE_ID_RE.search(url))
+
 
 def _urllib3_request_context(
     request: "PreparedRequest",
@@ -82,9 +122,21 @@ def _urllib3_request_context(
 ) -> "(dict[str, typing.Any], dict[str, typing.Any])":
     host_params = {}
     pool_kwargs = {}
-    parsed_request_url = urlparse(request.url)
-    scheme = parsed_request_url.scheme.lower()
-    port = parsed_request_url.port
+
+    # Use urllib3's parse_url for IPv6 zone IDs, urlparse otherwise
+    if _has_ipv6_zone_id(request.url):
+        parsed_request_url = parse_url(request.url)
+        scheme = parsed_request_url.scheme.lower()
+        port = parsed_request_url.port
+        # parse_url uses .host and includes brackets for IPv6, strip them
+        hostname = parsed_request_url.host
+        if hostname and hostname.startswith("[") and hostname.endswith("]"):
+            hostname = hostname[1:-1]
+    else:
+        parsed_request_url = urlparse(request.url)
+        scheme = parsed_request_url.scheme.lower()
+        port = parsed_request_url.port
+        hostname = parsed_request_url.hostname  # urlparse uses .hostname
 
     cert_reqs = "CERT_REQUIRED"
     if verify is False:
@@ -105,7 +157,7 @@ def _urllib3_request_context(
             pool_kwargs["cert_file"] = client_cert
     host_params = {
         "scheme": scheme,
-        "host": parsed_request_url.hostname,
+        "host": hostname,
         "port": port,
     }
     return host_params, pool_kwargs
@@ -536,7 +588,10 @@ def request_url(self, request, proxies):
         :rtype: str
         """
         proxy = select_proxy(request.url, proxies)
-        scheme = urlparse(request.url).scheme
+        if _has_ipv6_zone_id(request.url):
+            scheme = parse_url(request.url).scheme
+        else:
+            scheme = urlparse(request.url).scheme
 
         is_proxied_http_request = proxy and scheme != "https"
         using_socks_proxy = False
diff --git a/src/requests/models.py b/src/requests/models.py
@@ -11,6 +11,7 @@
 # Implicit import within threads may cause LookupError when standard library is in a ZIP,
 # such as in Embedded Python. See https://github.com/psf/requests/issues/3578.
 import encodings.idna  # noqa: F401
+import re
 from io import UnsupportedOperation
 
 from urllib3.exceptions import (
@@ -82,6 +83,14 @@
 CONTENT_CHUNK_SIZE = 10 * 1024
 ITER_CHUNK_SIZE = 512
 
+# Regex patterns for IPv6 zone ID handling in prepare_url.
+# Extracts the bracket content from the authority section of the URL.
+_AUTHORITY_BRACKET_RE = re.compile(r"://[^/?#]*\[([^\]]*)\]")
+# Matches an RFC 6874 zone ID delimiter (%25) followed by zone ID characters.
+_RFC6874_ZONE_ID_RE = re.compile(r"%25(?:[a-zA-Z0-9_.\-~]|%[0-9A-Fa-f]{2})+")
+# Matches a raw % zone ID delimiter (not a valid percent-encoded byte).
+_RAW_ZONE_ID_RE = re.compile(r"%(?![0-9A-Fa-f]{2})[0-9A-Za-z][A-Za-z0-9_.\-]*")
+
 
 class RequestEncodingMixin:
     @property
@@ -436,6 +445,41 @@ def prepare_url(self, url, params):
         except LocationParseError as e:
             raise InvalidURL(*e.args)
 
+        # Mitigation for RFC 6874: parse_url incorrectly decodes zone ID delimiter (%25 -> %)
+        # We reconstruct the host with the correct, fully-encoded delimiter to prevent
+        # downstream errors (like ipaddress validation or incorrect connection arguments).
+        #
+        # Matching on the parse_url-decoded host is ambiguous because parse_url decodes
+        # %25 -> % and then the resulting %XX may look like a valid percent-encoding
+        # (e.g. %2550 becomes %50 which resembles percent-encoded 'P'). Instead we
+        # extract the bracket content from the ORIGINAL url (before any decoding) and
+        # match there. Two input forms are handled:
+        #
+        #   1. RFC 6874 encoded form (%25 delimiter): the original bracket contains %25
+        #      followed by one or more ZoneID unreserved chars ([A-Za-z0-9_.\-~]) or
+        #      pct-encoded octets (%XX). Examples: [fe80::1%25eth0], [fe80::1%255],
+        #      [fe80::1%25_foo]. The matched segment is placed verbatim into host.
+        #
+        #   2. Raw % delimiter (legacy/non-standard): a literal % that is NOT a valid
+        #      %XX percent-encoding, followed by an alphanumeric char then identifier chars.
+        #      Examples: [fe80::1%eth0], [fe80::1%wlan0]. Re-encoded as %25<zone_name>.
+        #
+        # This avoids false-positive re-encoding of legitimate %XX sequences (e.g. %20,
+        # %AB) that should never be treated as zone ID delimiters.
+        if host and host.startswith("[") and host.endswith("]"):
+            original_bracket = _AUTHORITY_BRACKET_RE.search(url)
+            if original_bracket:
+                original_inner = original_bracket.group(1)
+                rfc_match = _RFC6874_ZONE_ID_RE.search(original_inner)
+                if rfc_match:
+                    ip_part = original_inner[: rfc_match.start()]
+                    host = f"[{ip_part}{rfc_match.group()}]"
+                else:
+                    raw_match = _RAW_ZONE_ID_RE.search(original_inner)
+                    if raw_match:
+                        pos = raw_match.start()
+                        host = f"[{original_inner[:pos]}%25{original_inner[pos + 1 :]}]"
+
         if not scheme:
             raise MissingSchema(
                 f"Invalid URL {url!r}: No scheme supplied. "
diff --git a/tests/test_adapters.py b/tests/test_adapters.py