bazel-contrib
diff --git a/‎CHANGELOG.md‎
Lines changed: 12 additions & 6 deletions b/‎CHANGELOG.md‎
Lines changed: 12 additions & 6 deletions
diff --git a/‎python/private/pypi/BUILD.bazel‎
Lines changed: 1 addition & 2 deletions b/‎python/private/pypi/BUILD.bazel‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎python/private/pypi/parse_simpleapi_html.bzl‎
Lines changed: 28 additions & 9 deletions b/‎python/private/pypi/parse_simpleapi_html.bzl‎
Lines changed: 28 additions & 9 deletions
diff --git a/‎python/private/pypi/pypi_cache.bzl‎
Lines changed: 54 additions & 0 deletions b/‎python/private/pypi/pypi_cache.bzl‎
Lines changed: 54 additions & 0 deletions
@@ -69,12 +69,18 @@ END_UNRELEASED_TEMPLATE
 Other changes:
 * (pypi) Update dependencies used for `compile_pip_requirements`, building
   sdists in the `whl_library` rule and fetching wheels using `pip`.
-* (pypi) We will set `allow_fail` to `False` if the
-  {attr}`experimental_index_url_overrides` is set
-  to a non-empty value. This means that failures will be no-longer cached in
-  this particular case.
-  ([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and
-  [#2632](https://github.com/bazel-contrib/rules_python/issues/2632))
+* (pypi) Before using the bazel downloader to fetch the PyPI package metadata,
+  we now fetch the list of available packages from each index. The
+  package mappings that are used will be written as facts to the `MODULE.bazel.lock` file
+  on supported bazel versions, and this should be done at most once. As a result,
+  per-package {obj}`experimental_index_url_overrides` is no longer needed if the index URLs are
+  passed to `pip.parse` via `experimental_index_url` and `experimental_extra_index_urls`.
+  In addition, the handling of `--index_url` and `--extra_index_urls` is now more in
+  line with how they are used in `uv` and `pip`, i.e. we default to `--index_url` if the package is not
+  found in `--extra_index_urls`.
+  Fixes
+  [#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and
+  [#2632](https://github.com/bazel-contrib/rules_python/issues/2632).
 
 {#v0-0-0-fixed}
 ### Fixed
 
@@ -244,6 +244,7 @@ bzl_library(
     srcs = ["parse_simpleapi_html.bzl"],
     deps = [
         ":version_from_filename_bzl",
+        "//python/private:normalize_name_bzl",
     ],
 )
 
@@ -424,8 +425,6 @@ bzl_library(
         ":urllib_bzl",
         "//python/private:auth_bzl",
         "//python/private:normalize_name_bzl",
-        "//python/private:text_util_bzl",
-        "@bazel_features//:features",
     ],
 )
 
 
@@ -16,16 +16,20 @@
 Parse SimpleAPI HTML in Starlark.
 """
 
+load("//python/private:normalize_name.bzl", "normalize_name")
 load(":version_from_filename.bzl", "version_from_filename")
 
-def parse_simpleapi_html(*, content):
+def parse_simpleapi_html(*, content, parse_index = False):
     """Get the package URLs for given shas by parsing the Simple API HTML.
 
     Args:
-        content(str): The Simple API HTML content.
+        content: {type}`str` The Simple API HTML content.
+        parse_index: {type}`bool` whether to parse the content as the index page of a PyPI index,
+            e.g. `https://pypi.org/simple/`. Such a page only lists the URLs of the individual packages.
 
     Returns:
-        A list of structs with:
+        If `parse_index` is true, a dict mapping each normalized package name to the URL it can be queried from.
+        Otherwise, a list of structs with:
           * filename: {type}`str` The filename of the artifact.
           * version: {type}`str` The version of the artifact.
           * url: {type}`str` The URL to download the artifact.
@@ -59,32 +63,44 @@ def parse_simpleapi_html(*, content):
         # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
         fail("Unsupported API version: {}".format(api_version))
 
+    packages = {}
+
     # 2. Iterate using find() to avoid huge list allocations from .split("<a ")
     cursor = 0
     for _ in range(1000000):  # Safety break for Starlark
         start_tag = content.find("<a ", cursor)
         if start_tag == -1:
             break
 
-        # Find the end of the opening tag and the closing </a>
-        tag_end = content.find(">", start_tag)
-        end_tag = content.find("</a>", tag_end)
-        if tag_end == -1 or end_tag == -1:
+        # Find the closing </a> tag first, then find the end of the opening
+        # <a ...> tag using rfind. This correctly handles attributes that
+        # contain > characters, e.g. data-requires-python=">=3.6".
+        end_tag = content.find("</a>", start_tag)
+        if end_tag == -1:
             break
+        tag_end = content.rfind(">", start_tag, end_tag)
+        if tag_end == -1 or tag_end <= start_tag:
+            cursor = end_tag + 4
+            continue
 
         # Extract only the necessary slices
-        attr_part = content[start_tag + 3:tag_end]
         filename = content[tag_end + 1:end_tag].strip()
+        attr_part = content[start_tag + 3:tag_end]
 
         # Update cursor for next iteration
         cursor = end_tag + 4
 
-        # 3. Efficient Attribute Parsing
         attrs = _parse_attrs(attr_part)
         href = attrs.get("href", "")
         if not href:
             continue
 
+        if parse_index:
+            pkg_name = filename
+            packages[normalize_name(pkg_name)] = href
+            continue
+
+        # 3. Split the href into the distribution URL and its sha256 fragment
         dist_url, _, sha256 = href.partition("#sha256=")
 
         # Handle Yanked status
@@ -121,6 +137,9 @@ def parse_simpleapi_html(*, content):
         else:
             sdists[sha256] = dist
 
+    if parse_index:
+        return packages
+
     return struct(
         sdists = sdists,
         whls = whls,
 
@@ -89,6 +89,11 @@ def _pypi_cache_get(self, key):
     if not cached and versions:
         # Could not get from in-memory, read from lockfile facts
         cached = self._facts.get(index_url, versions)
+    else:
+        # We might be using something from memory that is not yet stored in facts (e.g. we processed
+        # the requirements.txt for one Python version and the deps got cached, but a new Python
+        # version means different deps, which may add extras).
+        self._facts.setdefault(index_url, cached)
 
     return cached
 
@@ -122,6 +127,13 @@ def _filter_packages(dists, requested_versions):
     if dists == None or not requested_versions:
         return dists
 
+    if type(dists) == "dict":
+        return {
+            pkg: url
+            for pkg, url in dists.items()
+            if pkg in requested_versions
+        }
+
     sha256s_by_version = {}
     whls = {}
     sdists = {}
@@ -193,6 +205,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver
         # cannot trust known facts, different version that we know how to parse
         return None
 
+    if type(requested_versions) == "dict":
+        return _filter_packages(
+            dists = known_facts.get("index_urls", {}).get(index_url, {}),
+            requested_versions = requested_versions,
+        )
+
     known_sources = {}
 
     root_url, _, distribution = index_url.rstrip("/").rpartition("/")
@@ -266,10 +284,46 @@ def _store_facts(facts, fact_version, index_url, value):
 
     facts["fact_version"] = fact_version
 
+    if type(value) == "dict":
+        # facts: {
+        #   "index_urls": {
+        #     "<index_url>": {
+        #       "<pkg_normalized>": "<dist_url>",
+        #     },
+        #   },
+        # },
+        for pkg, url in value.items():
+            facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url
+        return value
+
     root_url, _, distribution = index_url.rstrip("/").rpartition("/")
     distribution = distribution.rstrip("/")
     root_url = root_url.rstrip("/")
 
+    # The schema is
+    # facts: {
+    #   "dist_hashes": {
+    #     "<index_url>": {
+    #       "<last segment>": {
+    #         "<dist url>": "<sha256>",
+    #       },
+    #     },
+    #   },
+    #   "dist_filenames": {
+    #     "<index_url>": {
+    #       "<last segment>": {
+    #         "<dist url>": "<filename>",   # if it is different from the URL
+    #       },
+    #     },
+    #   },
+    #   "dist_yanked": {
+    #     "<index_url>": {
+    #       "<last segment>": {
+    #         "<sha256>": "<reason>",   # if the package is yanked
+    #       },
+    #     },
+    #   },
+    # },
     for sha256, d in (value.sdists | value.whls).items():
         facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
         if not d.url.endswith(d.filename):
Original file line number	Diff line number	Diff line change
`@@ -244,6 +244,7 @@ bzl_library(`
`244`	`244`	`srcs = ["parse_simpleapi_html.bzl"],`
`245`	`245`	`deps = [`
`246`	`246`	`":version_from_filename_bzl",`
	`247`	`+ "//python/private:normalize_name_bzl",`
`247`	`248`	`],`
`248`	`249`	`)`
`249`	`250`
`@@ -424,8 +425,6 @@ bzl_library(`
`424`	`425`	`":urllib_bzl",`
`425`	`426`	`"//python/private:auth_bzl",`
`426`	`427`	`"//python/private:normalize_name_bzl",`
`427`		`- "//python/private:text_util_bzl",`
`428`		`- "@bazel_features//:features",`
`429`	`428`	`],`
`430`	`429`	`)`
`431`	`430`