Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/private/pypi/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -418,13 +418,19 @@ bzl_library(
srcs = ["simpleapi_download.bzl"],
deps = [
":parse_simpleapi_html_bzl",
":urllib_bzl",
"//python/private:auth_bzl",
"//python/private:normalize_name_bzl",
"//python/private:text_util_bzl",
"@bazel_features//:features",
],
)

# Starlark library for urllib.bzl, which provides the `urllib` helpers used by
# simpleapi_download.bzl (e.g. urllib.strip_empty_path_segments); listed as a
# dep of the simpleapi_download and whl-related bzl_library targets above/below.
bzl_library(
    name = "urllib_bzl",
    srcs = ["urllib.bzl"],
)

bzl_library(
name = "version_from_filename_bzl",
srcs = ["version_from_filename.bzl"],
Expand Down Expand Up @@ -474,6 +480,7 @@ bzl_library(
":patch_whl_bzl",
":pep508_requirement_bzl",
":pypi_repo_utils_bzl",
":urllib_bzl",
":whl_extract_bzl",
":whl_metadata_bzl",
":whl_target_platforms_bzl",
Expand Down
4 changes: 4 additions & 0 deletions python/private/pypi/hub_builder.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,7 @@ def _create_whl_repos(
for src in whl.srcs:
repo = _whl_repo(
src = src,
index_url = whl.index_url,
whl_library_args = whl_library_args,
download_only = pip_attr.download_only,
netrc = self._config.netrc or pip_attr.netrc,
Expand Down Expand Up @@ -678,6 +679,7 @@ def _whl_repo(
*,
src,
whl_library_args,
index_url,
is_multiple_versions,
download_only,
netrc,
Expand Down Expand Up @@ -731,6 +733,8 @@ def _whl_repo(
args["netrc"] = netrc
if auth_patterns:
args["auth_patterns"] = auth_patterns
if index_url:
args["index_url"] = index_url

args["urls"] = [src.url]
args["sha256"] = src.sha256
Expand Down
8 changes: 5 additions & 3 deletions python/private/pypi/parse_requirements.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,11 @@ def parse_requirements(
for p in r.target_platforms:
requirement_target_platforms[p] = None

pkg_sources = index_urls.get(name)
package_srcs = _package_srcs(
name = name,
reqs = reqs,
index_urls = index_urls,
pkg_sources = pkg_sources,
platforms = platforms,
extract_url_srcs = extract_url_srcs,
logger = logger,
Expand All @@ -216,6 +217,7 @@ def parse_requirements(
name = normalize_name(name),
is_exposed = len(requirement_target_platforms) == len(requirements),
is_multiple_versions = len(reqs.values()) > 1,
index_url = pkg_sources.index_url if pkg_sources else "",
srcs = package_srcs,
)
ret.append(item)
Expand All @@ -234,7 +236,7 @@ def _package_srcs(
*,
name,
reqs,
index_urls,
pkg_sources,
platforms,
logger,
extract_url_srcs):
Expand All @@ -253,7 +255,7 @@ def _package_srcs(
dist, can_fallback = _add_dists(
requirement = r,
target_platform = platforms.get(target_platform),
index_urls = index_urls.get(name),
index_urls = pkg_sources,
logger = logger,
)
logger.debug(lambda: "The whl dist is: {}".format(dist.filename if dist else dist))
Expand Down
50 changes: 2 additions & 48 deletions python/private/pypi/parse_simpleapi_html.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@ Parse SimpleAPI HTML in Starlark.

load(":version_from_filename.bzl", "version_from_filename")

def parse_simpleapi_html(*, url, content):
def parse_simpleapi_html(*, content):
"""Get the package URLs for given shas by parsing the Simple API HTML.

Args:
url(str): The URL that the HTML content can be downloaded from.
content(str): The Simple API HTML content.

Returns:
Expand Down Expand Up @@ -57,7 +56,6 @@ def parse_simpleapi_html(*, url, content):
sha256s_by_version = {}
for line in lines[1:]:
dist_url, _, tail = line.partition("#sha256=")
dist_url = _absolute_url(url, dist_url)

sha256, _, tail = tail.partition("\"")

Expand Down Expand Up @@ -87,7 +85,7 @@ def parse_simpleapi_html(*, url, content):
url = dist_url,
sha256 = sha256,
metadata_sha256 = metadata_sha256,
metadata_url = _absolute_url(url, metadata_url) if metadata_url else "",
metadata_url = metadata_url,
yanked = yanked,
)
else:
Expand All @@ -106,47 +104,3 @@ def parse_simpleapi_html(*, url, content):
whls = whls,
sha256s_by_version = sha256s_by_version,
)

def _get_root_directory(url):
    """Return the scheme-and-host root of *url* (e.g. "https://host").

    Fails when *url* does not contain a "://" scheme separator.
    """
    scheme, sep, remainder = url.partition("://")
    if not sep:
        fail("Invalid URL format")

    # Everything up to the first "/" after the scheme is the host part;
    # if there is no "/", the whole remainder is the host.
    host = remainder.partition("/")[0]
    return "{}://{}".format(scheme, host)

def _is_downloadable(url):
    """Report whether the Bazel downloader would accept *url*.

    Mirrors Bazel's HttpUtils::isUrlSupportedByDownloader.
    """
    for prefix in ("http://", "https://", "file://"):
        if url.startswith(prefix):
            return True
    return False

def _absolute_url(index_url, candidate):
    """Resolve *candidate* (an href from a Simple API page) against *index_url*.

    Args:
        index_url: The URL the HTML page was downloaded from.
        candidate: The href value: empty, already absolute, host-absolute
            ("/..."), relative with up-references ("../..."), or plain relative.

    Returns:
        A URL the Bazel downloader can fetch, or "" when candidate is empty.
    """
    if candidate == "":
        return candidate

    # Already a URL Bazel's downloader accepts
    # (Bazel's HttpUtils::isUrlSupportedByDownloader).
    for prefix in ("http://", "https://", "file://"):
        if candidate.startswith(prefix):
            return candidate

    if candidate.startswith("/"):
        # Host-absolute path: join with the scheme://host root of index_url.
        scheme, sep, remainder = index_url.partition("://")
        if not sep:
            fail("Invalid URL format")
        host = remainder.partition("/")[0]
        return "{}://{}{}".format(scheme, host, candidate)

    if candidate.startswith(".."):
        # Relative path with up-references: drop one trailing path segment
        # from index_url per "..".
        parts = candidate.split("..")
        base = index_url
        for _ in range(len(parts) - 1):
            base = base.rstrip("/").rpartition("/")[0]
        return "{}/{}".format(base, parts[-1].strip("/"))

    # Plain relative path without up-references.
    return "{}/{}".format(index_url.rstrip("/"), candidate)
10 changes: 9 additions & 1 deletion python/private/pypi/pypi_cache.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ In the future the same will be used to:
"""

def pypi_cache(store = None):
"""The cache for PyPI index queries."""
"""The cache for PyPI index queries.

Currently the key is of the following structure:
(url, real_url)
"""

# buildifier: disable=uninitialized
self = struct(
Expand All @@ -29,6 +33,10 @@ def _pypi_cache_setdefault(self, key, parsed_result):
key: {type}`str` The cache key, can be any string.
parsed_result: {type}`struct` The result of `parse_simpleapi_html` function.

index_url and distribution are used to write to the MODULE.bazel.lock file as facts
real_index_url and distribution are used to write to the in-memory cache to ensure that there are
no duplicate calls to the PyPI indexes

Returns:
The `parse_result`.
"""
Expand Down
83 changes: 44 additions & 39 deletions python/private/pypi/simpleapi_download.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ load("//python/private:envsubst.bzl", "envsubst")
load("//python/private:normalize_name.bzl", "normalize_name")
load("//python/private:text_util.bzl", "render")
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
load(":urllib.bzl", "urllib")

def simpleapi_download(
ctx,
Expand Down Expand Up @@ -92,13 +93,14 @@ def simpleapi_download(
sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
for pkg in sources:
pkg_normalized = normalize_name(pkg)
url = urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
index_url = index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
distribution = pkg,
))
result = read_simpleapi(
ctx = ctx,
url = "{}/{}/".format(
index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
pkg,
),
attr = attr,
url = url,
cache = cache,
get_auth = get_auth,
**download_kwargs
Expand All @@ -108,9 +110,10 @@ def simpleapi_download(
async_downloads[pkg] = struct(
pkg_normalized = pkg_normalized,
wait = result.wait,
url = url,
)
elif result.success:
contents[pkg_normalized] = result.output
contents[pkg_normalized] = _with_index_url(url, result.output)
found_on_index[pkg] = index_url

if not async_downloads:
Expand All @@ -122,7 +125,7 @@ def simpleapi_download(
result = download.wait()

if result.success:
contents[download.pkg_normalized] = result.output
contents[download.pkg_normalized] = _with_index_url(download.url, result.output)
found_on_index[pkg] = index_url

failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
Expand Down Expand Up @@ -168,14 +171,14 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):

Args:
ctx: The module_ctx or repository_ctx.
url: str, the url parameter that can be passed to ctx.download.
url: {type}`str`, the url parameter that can be passed to ctx.download.
attr: The attribute that contains necessary info for downloading. The
following attributes must be present:
* envsubst: The envsubst values for performing substitutions in the URL.
* netrc: The netrc parameter for ctx.download, see http_file for docs.
* envsubst: {type}`dict[str, str]` for performing substitutions in the URL.
* netrc: The netrc parameter for ctx.download, see {obj}`http_file` for docs.
* auth_patterns: The auth_patterns parameter for ctx.download, see
http_file for docs.
cache: A dict for storing the results.
{obj}`http_file` for docs.
cache: {type}`struct` the `pypi_cache` instance.
get_auth: A function to get auth information. Used in tests.
**download_kwargs: Any extra params to ctx.download.
Note that output and auth will be passed for you.
Expand All @@ -189,9 +192,9 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
# them to ctx.download if we want to correctly handle the relative URLs.
# TODO: Add a test that env subbed index urls do not leak into the lock file.

real_url = strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv))
real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv))

cache_key = real_url
cache_key = (url, real_url)
cached_result = cache.get(cache_key)
if cached_result:
return struct(success = True, output = cached_result)
Expand Down Expand Up @@ -225,41 +228,43 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
if download_kwargs.get("block") == False:
# Simulate the same API as ctx.download has
return struct(
wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, cache_key),
wait = lambda: _read_index_result(
ctx,
result = download.wait(),
output = output,
cache = cache,
cache_key = cache_key,
),
)

return _read_index_result(ctx, download, output, real_url, cache, cache_key)

def strip_empty_path_segments(url):
    """Collapse empty path segments (runs of "/") in a URL.

    Public only for testing.

    Args:
        url: The url to normalize.

    Returns:
        The url with empty path segments dropped; a single trailing slash is
        preserved. A url without a scheme is returned unchanged.
    """
    scheme, _, remainder = url.partition("://")
    if not remainder:
        # No scheme separator (or nothing after it): leave the value as-is.
        return url

    segments = [segment for segment in remainder.split("/") if segment]
    trailing = "/" if url.endswith("/") else ""
    return "{}://{}{}".format(scheme, "/".join(segments), trailing)
return _read_index_result(
ctx,
result = download,
output = output,
cache = cache,
cache_key = cache_key,
)

def _read_index_result(ctx, *, result, output, cache, cache_key):
    """Parse a downloaded Simple API page and memoize the parsed result.

    NOTE(review): the captured span interleaved the pre- and post-change
    variants of this function (two `def` lines, two parse_simpleapi_html
    calls, two success returns) from the rendered diff; this reconstructs the
    post-change variant, which matches the keyword-argument call sites above.

    Args:
        ctx: The module_ctx or repository_ctx; only ctx.read is used here.
        result: The ctx.download result (or the awaited async result).
        output: The path the HTML page was downloaded to.
        cache: The `pypi_cache` instance used to deduplicate index queries.
        cache_key: The key to store the parsed page under.

    Returns:
        struct(success = False) when the download or the parse failed,
        otherwise struct(success = True, output = <parse_simpleapi_html result>).
    """
    if not result.success:
        return struct(success = False)

    content = ctx.read(output)

    parsed = parse_simpleapi_html(content = content)
    if not parsed:
        return struct(success = False)

    # Memoize so that repeated queries for the same index page are served
    # from the in-memory cache.
    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed)

def _with_index_url(index_url, values):
    """Return a copy of *values* extended with the index url it came from.

    Args:
        index_url: The url of the Simple API index page that was fetched.
        values: The parsed index result struct, or a falsy value.

    Returns:
        A new struct carrying the same sdists/whls/sha256s_by_version fields
        plus index_url, or *values* unchanged when it is falsy.
    """
    if values:
        return struct(
            index_url = index_url,
            sdists = values.sdists,
            sha256s_by_version = values.sha256s_by_version,
            whls = values.whls,
        )
    return values
Loading