Skip to content

Commit 900d557

Browse files
authored
feat(pypi): first check index contents before downloading metadata about distributions (#3657)
The overall changes to the architecture are: * First check which packages are on which index. * Then write these details as facts for future reuse. * Then use the `index_url_overrides` to ensure that we are pulling things from the right index for the packages. * Then download everything. Notes on implementation: * This will pull index contents at most once for each index. * This will make the initial download times longer, but because we have `MODULE.bazel.lock` file and the facts written there, it should be OK. * This allows us to just parse the `index_url` and `extra_index_urls` from the lock files and use that without printing any warning messages. If we don't see any regressions in testing, I think this code path could be enabled for everyone by default. So `experimental_index_url` will no longer be experimental. I think this might have been the last thing holding us from flipping the switch. Fixes #2632 Fixes #3260
1 parent 293e643 commit 900d557

File tree

10 files changed

+401
-252
lines changed

10 files changed

+401
-252
lines changed

CHANGELOG.md

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,18 @@ END_UNRELEASED_TEMPLATE
6969
Other changes:
7070
* (pypi) Update dependencies used for `compile_pip_requirements`, building
7171
sdists in the `whl_library` rule and fetching wheels using `pip`.
72-
* (pypi) We will set `allow_fail` to `False` if the
73-
{attr}`experimental_index_url_overrides` is set
74-
to a non-empty value. This means that failures will be no-longer cached in
75-
this particular case.
76-
([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and
77-
[#2632](https://github.com/bazel-contrib/rules_python/issues/2632))
72+
* (pypi) Before using the bazel downloader to fetch the PyPI package metadata
73+
we will from now on fetch the lists of available packages on each index. The
74+
used package mappings will be written as facts to the `MODULE.bazel.lock` file
75+
on supported bazel versions and it should be done at most once. As a result,
76+
per-package {obj}`experimental_index_url_overrides` is no longer needed if the index URLs are
77+
passed to the `pip.parse` via `experimental_index_url` and `experimental_extra_index_urls`.
78+
What is more, we start implementing the flags for `--index_url` and `--extra_index_urls` more in
79+
line to how it is used in `uv` and `pip`, i.e. we default to `--index_url` if the package is not
80+
found in `--extra_index_urls`.
81+
Fixes
82+
([#3260](https://github.com/bazel-contrib/rules_python/issues/3260) and
83+
[#2632](https://github.com/bazel-contrib/rules_python/issues/2632)).
7884

7985
{#v0-0-0-fixed}
8086
### Fixed

python/private/pypi/BUILD.bazel

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ bzl_library(
244244
srcs = ["parse_simpleapi_html.bzl"],
245245
deps = [
246246
":version_from_filename_bzl",
247+
"//python/private:normalize_name_bzl",
247248
],
248249
)
249250

@@ -424,8 +425,6 @@ bzl_library(
424425
":urllib_bzl",
425426
"//python/private:auth_bzl",
426427
"//python/private:normalize_name_bzl",
427-
"//python/private:text_util_bzl",
428-
"@bazel_features//:features",
429428
],
430429
)
431430

python/private/pypi/parse_simpleapi_html.bzl

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,20 @@
1616
Parse SimpleAPI HTML in Starlark.
1717
"""
1818

19+
load("//python/private:normalize_name.bzl", "normalize_name")
1920
load(":version_from_filename.bzl", "version_from_filename")
2021

21-
def parse_simpleapi_html(*, content):
22+
def parse_simpleapi_html(*, content, parse_index = False):
2223
"""Get the package URLs for given shas by parsing the Simple API HTML.
2324
2425
Args:
25-
content(str): The Simple API HTML content.
26+
content: {type}`str` The Simple API HTML content.
27+
parse_index: {type}`bool` whether to parse the content as the index page of the PyPI index,
28+
e.g. the `https://pypi.org/simple/`. This only has the URLs for the individual package.
2629
2730
Returns:
28-
A list of structs with:
31+
If it is the index page, return the map of package to URL it can be queried from.
32+
Otherwise, a list of structs with:
2933
* filename: {type}`str` The filename of the artifact.
3034
* version: {type}`str` The version of the artifact.
3135
* url: {type}`str` The URL to download the artifact.
@@ -59,32 +63,44 @@ def parse_simpleapi_html(*, content):
5963
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
6064
fail("Unsupported API version: {}".format(api_version))
6165

66+
packages = {}
67+
6268
# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
6369
cursor = 0
6470
for _ in range(1000000): # Safety break for Starlark
6571
start_tag = content.find("<a ", cursor)
6672
if start_tag == -1:
6773
break
6874

69-
# Find the end of the opening tag and the closing </a>
70-
tag_end = content.find(">", start_tag)
71-
end_tag = content.find("</a>", tag_end)
72-
if tag_end == -1 or end_tag == -1:
75+
# Find the closing </a> tag first, then find the end of the opening
76+
# <a ...> tag using rfind. This correctly handles attributes that
77+
# contain > characters, e.g. data-requires-python=">=3.6".
78+
end_tag = content.find("</a>", start_tag)
79+
if end_tag == -1:
7380
break
81+
tag_end = content.rfind(">", start_tag, end_tag)
82+
if tag_end == -1 or tag_end <= start_tag:
83+
cursor = end_tag + 4
84+
continue
7485

7586
# Extract only the necessary slices
76-
attr_part = content[start_tag + 3:tag_end]
7787
filename = content[tag_end + 1:end_tag].strip()
88+
attr_part = content[start_tag + 3:tag_end]
7889

7990
# Update cursor for next iteration
8091
cursor = end_tag + 4
8192

82-
# 3. Efficient Attribute Parsing
8393
attrs = _parse_attrs(attr_part)
8494
href = attrs.get("href", "")
8595
if not href:
8696
continue
8797

98+
if parse_index:
99+
pkg_name = filename
100+
packages[normalize_name(pkg_name)] = href
101+
continue
102+
103+
# 3. Efficient Attribute Parsing
88104
dist_url, _, sha256 = href.partition("#sha256=")
89105

90106
# Handle Yanked status
@@ -121,6 +137,9 @@ def parse_simpleapi_html(*, content):
121137
else:
122138
sdists[sha256] = dist
123139

140+
if parse_index:
141+
return packages
142+
124143
return struct(
125144
sdists = sdists,
126145
whls = whls,

python/private/pypi/pypi_cache.bzl

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ def _pypi_cache_get(self, key):
8989
if not cached and versions:
9090
# Could not get from in-memory, read from lockfile facts
9191
cached = self._facts.get(index_url, versions)
92+
else:
93+
# We might be using something from memory that is not yet stored in facts (e.g. we processed
94+
# the requirements.txt for one Python version and the deps got cached, but new python
95+
# version means different deps, which may add extras.
96+
self._facts.setdefault(index_url, cached)
9297

9398
return cached
9499

@@ -122,6 +127,13 @@ def _filter_packages(dists, requested_versions):
122127
if dists == None or not requested_versions:
123128
return dists
124129

130+
if type(dists) == "dict":
131+
return {
132+
pkg: url
133+
for pkg, url in dists.items()
134+
if pkg in requested_versions
135+
}
136+
125137
sha256s_by_version = {}
126138
whls = {}
127139
sdists = {}
@@ -193,6 +205,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver
193205
# cannot trust known facts, different version that we know how to parse
194206
return None
195207

208+
if type(requested_versions) == "dict":
209+
return _filter_packages(
210+
dists = known_facts.get("index_urls", {}).get(index_url, {}),
211+
requested_versions = requested_versions,
212+
)
213+
196214
known_sources = {}
197215

198216
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
@@ -266,10 +284,46 @@ def _store_facts(facts, fact_version, index_url, value):
266284

267285
facts["fact_version"] = fact_version
268286

287+
if type(value) == "dict":
288+
# facts: {
289+
# "index_urls": {
290+
# "<index_url>": {
291+
# "<pkg_normalized>": "<dist_url>",
292+
# },
293+
# },
294+
# },
295+
for pkg, url in value.items():
296+
facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url
297+
return value
298+
269299
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
270300
distribution = distribution.rstrip("/")
271301
root_url = root_url.rstrip("/")
272302

303+
# The schema is
304+
# facts: {
305+
# "dist_hashes": {
306+
# "<index_url>": {
307+
# "<last segment>": {
308+
# "<dist url>": "<sha256>",
309+
# },
310+
# },
311+
# },
312+
# "dist_filenames": {
313+
# "<index_url>": {
314+
# "<last segment>": {
315+
# "<dist url>": "<filename>", # if it is different from the URL
316+
# },
317+
# },
318+
# },
319+
# "dist_yanked": {
320+
# "<index_url>": {
321+
# "<last segment>": {
322+
# "<sha256>": "<reason>", # if the package is yanked
323+
# },
324+
# },
325+
# },
326+
# },
273327
for sha256, d in (value.sdists | value.whls).items():
274328
facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
275329
if not d.url.endswith(d.filename):

0 commit comments

Comments
 (0)